Adding Festinger with wordnet
This commit is contained in:
@@ -0,0 +1,125 @@
|
||||
"""
|
||||
WordNet importer — loads Princeton WordNet 3.x index files into SOAS.
|
||||
|
||||
Reads index.noun, index.verb, index.adj, index.adv from the wordnet/ directory.
|
||||
Each non-header line's first field is the lemma (already lowercase, underscores
|
||||
for compound words — matches our compound token convention exactly).
|
||||
|
||||
All tokens are inserted with saliency=0, novelty=0 (common English baseline).
|
||||
Insert is idempotent: ON CONFLICT DO NOTHING.
|
||||
|
||||
Citation:
|
||||
Princeton University "About WordNet." WordNet. Princeton University. 2010.
|
||||
https://wordnet.princeton.edu/
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import AsyncIterator
|
||||
|
||||
import asyncpg
|
||||
|
||||
from . import cache
|
||||
from .cache import SoasRow
|
||||
|
||||
# Module logger; the name is fixed so log configuration can target it.
log = logging.getLogger("festinger.wordnet")

# The index files live in a "wordnet" directory two levels above this module
# file's location (i.e. next to this module's package directory).
WORDNET_DIR = Path(__file__).parent.parent / "wordnet"

# One index file per part of speech: index.noun, index.verb, index.adj, index.adv.
INDEX_FILES = [f"index.{pos}" for pos in ("noun", "verb", "adj", "adv")]

# Number of lemmas sent to the database per INSERT statement.
BATCH_SIZE = 2000

# Attribution string returned alongside the import summary.
CITATION = " ".join(
    (
        'Princeton University "About WordNet."',
        "WordNet.",
        "Princeton University.",
        "2010.",
        "https://wordnet.princeton.edu/",
    )
)
|
||||
|
||||
|
||||
def _parse_index_file(path: Path) -> list[str]:
    """
    Extract lemma tokens from a WordNet index file.

    Header lines start with a space or tab, or are blank — they are skipped.
    Data line format:  lemma  pos  synset_cnt  p_cnt  ...
    Only the first whitespace-separated field (the lemma) is used.
    Lemmas are expected to be lowercase already, with underscores joining
    compound words; no normalisation is applied here.

    Purely numeric lemmas and single-character lemmas are dropped.
    Bytes that are not valid UTF-8 are replaced instead of raising.

    Returns the lemmas in file order; duplicates are NOT removed here.
    If the file does not exist, a warning is logged and an empty list is
    returned.
    """
    tokens: list[str] = []
    try:
        with open(path, encoding="utf-8", errors="replace") as f:
            for line in f:
                # Header / blank lines: leading space, tab, or a bare newline.
                if not line or line[0] in (" ", "\t", "\n"):
                    continue
                # Only the first field is needed, so split at most once.
                fields = line.split(None, 1)
                if not fields:
                    # Whitespace-only line that slipped past the check above
                    # (e.g. a lone form feed); indexing [0] would raise.
                    continue
                lemma = fields[0]
                # Skip purely numeric tokens and single chars.
                if not lemma.isdigit() and len(lemma) > 1:
                    tokens.append(lemma)
    except FileNotFoundError:
        log.warning("wordnet file not found: %s", path)
    return tokens
|
||||
|
||||
|
||||
def collect_all_lemmas() -> list[str]:
    """
    Gather the lemmas from every file in INDEX_FILES, without repeats.

    Files are read in INDEX_FILES order. A lemma that appears more than once
    (within one file or across files) is kept only at its first position.
    """
    every_lemma = (
        lemma
        for index_name in INDEX_FILES
        for lemma in _parse_index_file(WORDNET_DIR / index_name)
    )
    # dict keys keep insertion order, so this drops repeats while preserving
    # the order in which each lemma was first seen.
    return list(dict.fromkeys(every_lemma))
|
||||
|
||||
|
||||
async def import_wordnet(pool: asyncpg.Pool) -> dict:
    """
    Bulk-load all WordNet lemmas into the soas table (saliency=0, novelty=0).

    Lemmas are sent in slices of BATCH_SIZE over a single pooled connection.
    Each slice is one INSERT ... ON CONFLICT (token) DO NOTHING statement, so
    tokens that already exist are left untouched. The rows handed back by
    RETURNING (the newly created ones) are mirrored into the in-memory cache
    maps ``cache.soas_by_token`` and ``cache.soas_by_id``.

    Returns:
        ``{"error": ...}`` when WORDNET_DIR does not exist; otherwise a
        summary dict with the keys "status", "total_lemmas", "inserted",
        "already_present" and "citation".
    """
    if not WORDNET_DIR.exists():
        return {"error": f"wordnet directory not found at {WORDNET_DIR}"}

    lemmas = collect_all_lemmas()
    total = len(lemmas)
    log.info("wordnet: %d lemmas collected, beginning import …", total)

    inserted = 0
    skipped = 0

    async with pool.acquire() as conn:
        # One statement per slice keeps each round trip bounded in size.
        for batch_no, start in enumerate(range(0, total, BATCH_SIZE)):
            chunk = lemmas[start : start + BATCH_SIZE]

            # RETURNING only yields rows that were actually inserted, which is
            # how new tokens are told apart from ones already present.
            new_rows = await conn.fetch(
                """
                INSERT INTO soas (token, saliency, novelty)
                SELECT unnest($1::text[]), 0.0, 0.0
                ON CONFLICT (token) DO NOTHING
                RETURNING id, token
                """,
                chunk,
            )

            for record in new_rows:
                row_id, token = record["id"], record["token"]
                cache.soas_by_token[token] = SoasRow(id=row_id, token=token)
                cache.soas_by_id[row_id] = token

            inserted += len(new_rows)
            skipped += len(chunk) - len(new_rows)

            # Progress line on the first slice and every tenth one after it.
            if batch_no % 10 == 0:
                log.info("wordnet import: %d / %d …", start + len(chunk), total)

    log.info(
        "wordnet import complete: %d inserted, %d already present, %d total",
        inserted, skipped, total,
    )
    return {
        "status": "ok",
        "total_lemmas": total,
        "inserted": inserted,
        "already_present": skipped,
        "citation": CITATION,
    }
|
||||
Reference in New Issue
Block a user