Files
agent0/plugins/festinger/festinger/wordnet.py
T
2026-04-19 16:16:13 +02:00

126 lines
4.0 KiB
Python

"""
WordNet importer — loads Princeton WordNet 3.x index files into SOAS.
Reads index.noun, index.verb, index.adj, index.adv from the wordnet/ directory.
Each non-header line's first field is the lemma (already lowercase, underscores
for compound words — matches our compound token convention exactly).
Purely numeric and single-character lemmas are skipped; all remaining tokens
are inserted with saliency=0, novelty=0 (common English baseline).
Insert is idempotent: ON CONFLICT DO NOTHING.
Citation:
Princeton University "About WordNet." WordNet. Princeton University. 2010.
https://wordnet.princeton.edu/
"""
from __future__ import annotations
import logging
from pathlib import Path
from typing import AsyncIterator
import asyncpg
from . import cache
from .cache import SoasRow
# Module-level logger for the importer.
log = logging.getLogger("festinger.wordnet")
# The "wordnet" directory located one level above this module's own directory.
WORDNET_DIR = Path(__file__).parent.parent / "wordnet"
# Index files read by collect_all_lemmas(), in this order.
INDEX_FILES = ["index.noun", "index.verb", "index.adj", "index.adv"]
# Number of lemmas sent to the database per INSERT statement in import_wordnet().
BATCH_SIZE = 2000
# Attribution string returned in the import summary under the "citation" key.
CITATION = (
    'Princeton University "About WordNet." WordNet. '
    "Princeton University. 2010. https://wordnet.princeton.edu/"
)
def _parse_index_file(path: Path) -> list[str]:
    """
    Extract lemma tokens from a WordNet index file.

    Header lines start with a space (or tab) and blank lines start with a
    newline — both are skipped. Data line format: lemma pos synset_cnt p_cnt ...
    The first whitespace-separated field of each data line is taken as the
    lemma. Purely numeric lemmas and single-character lemmas are dropped.

    Args:
        path: Location of the index file to read.

    Returns:
        The lemmas in file order; duplicates are not removed here. An empty
        list is returned (and a warning logged) when the file does not exist.
    """
    tokens: list[str] = []
    try:
        # errors="replace" keeps the import going past undecodable bytes.
        with open(path, encoding="utf-8", errors="replace") as f:
            for line in f:
                if line.startswith((" ", "\t", "\n")):
                    continue
                fields = line.split()
                if not fields:
                    # Whitespace-only line that slipped past the prefix check
                    # (e.g. a stray form feed); there is no lemma to read.
                    continue
                lemma = fields[0]
                # Skip purely numeric tokens and single chars
                if not lemma.isdigit() and len(lemma) > 1:
                    tokens.append(lemma)
    except FileNotFoundError:
        log.warning("wordnet file not found: %s", path)
    return tokens
def collect_all_lemmas() -> list[str]:
    """
    Gather the lemmas from every file named in INDEX_FILES.

    Files are read in INDEX_FILES order from WORDNET_DIR. A lemma that occurs
    more than once (within a file or across files) is kept only at the
    position of its first occurrence.
    """
    # A dict keeps insertion order, and update() leaves an existing key where
    # it already is, so this yields first-occurrence order.
    ordered: dict[str, None] = {}
    for index_name in INDEX_FILES:
        ordered.update(dict.fromkeys(_parse_index_file(WORDNET_DIR / index_name)))
    return list(ordered)
async def import_wordnet(pool: asyncpg.Pool) -> dict:
    """
    Insert every collected WordNet lemma into the soas table.

    Lemmas are sent in slices of BATCH_SIZE, each as a single
    INSERT … ON CONFLICT (token) DO NOTHING with saliency and novelty set to
    0.0. Rows reported back by RETURNING (the newly inserted ones) are written
    into cache.soas_by_token and cache.soas_by_id.

    Args:
        pool: Connection pool; one connection is acquired for the whole run.

    Returns:
        {"error": ...} when WORDNET_DIR does not exist; otherwise a summary
        with "status", "total_lemmas", "inserted", "already_present" and
        "citation".
    """
    if not WORDNET_DIR.exists():
        return {"error": f"wordnet directory not found at {WORDNET_DIR}"}

    lemmas = collect_all_lemmas()
    total = len(lemmas)
    log.info("wordnet: %d lemmas collected, beginning import …", total)

    inserted = 0
    skipped = 0
    async with pool.acquire() as conn:
        # One statement per slice keeps each write small.
        for batch_no, start in enumerate(range(0, total, BATCH_SIZE)):
            chunk = lemmas[start : start + BATCH_SIZE]
            # RETURNING only yields rows that were actually inserted, which is
            # how new tokens are told apart from ones already present.
            new_rows = await conn.fetch(
                """
                INSERT INTO soas (token, saliency, novelty)
                SELECT unnest($1::text[]), 0.0, 0.0
                ON CONFLICT (token) DO NOTHING
                RETURNING id, token
                """,
                chunk,
            )
            for record in new_rows:
                row_id = record["id"]
                token = record["token"]
                cache.soas_by_token[token] = SoasRow(id=row_id, token=token)
                cache.soas_by_id[row_id] = token
            inserted += len(new_rows)
            skipped += len(chunk) - len(new_rows)
            # Progress line on every tenth slice, starting with the first.
            if batch_no % 10 == 0:
                log.info("wordnet import: %d / %d", start + len(chunk), total)

    log.info(
        "wordnet import complete: %d inserted, %d already present, %d total",
        inserted, skipped, total,
    )
    summary = {
        "status": "ok",
        "total_lemmas": total,
        "inserted": inserted,
        "already_present": skipped,
        "citation": CITATION,
    }
    return summary