126 lines
4.0 KiB
Python
126 lines
4.0 KiB
Python
|
|
"""
|
||
|
|
WordNet importer — loads Princeton WordNet 3.x index files into SOAS.

Reads index.noun, index.verb, index.adj, index.adv from the wordnet/ directory.
Each non-header line's first field is the lemma (already lowercase, underscores
for compound words — matches our compound token convention exactly).

All tokens are inserted with saliency=0, novelty=0 (common English baseline).
Insert is idempotent: ON CONFLICT DO NOTHING.

Citation:
Princeton University "About WordNet." WordNet. Princeton University. 2010.
https://wordnet.princeton.edu/
|
||
|
|
"""
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import logging
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import AsyncIterator
|
||
|
|
|
||
|
|
import asyncpg
|
||
|
|
|
||
|
|
from . import cache
|
||
|
|
from .cache import SoasRow
|
||
|
|
|
||
|
|
# Module-level logger used by every function in this importer.
log = logging.getLogger("festinger.wordnet")

# The "wordnet" directory sits next to the directory containing this module.
WORDNET_DIR = Path(__file__).parent.parent.joinpath("wordnet")

# One index file per part of speech; read in this order.
INDEX_FILES = [f"index.{pos}" for pos in ("noun", "verb", "adj", "adv")]

# Number of lemmas sent to the database per INSERT statement.
BATCH_SIZE = 2000

# Attribution string returned in the import summary.
CITATION = (
    'Princeton University "About WordNet." '
    "WordNet. Princeton University. 2010. "
    "https://wordnet.princeton.edu/"
)
|
||
|
|
|
||
|
|
|
||
|
|
def _parse_index_file(path: Path) -> list[str]:
    """
    Extract lemma tokens from a WordNet index file.

    Header lines start with a space or are blank — skip them.
    Data line format: lemma pos synset_cnt p_cnt ...
    Lemmas are already lowercase; underscores join compound words.

    Purely numeric lemmas and single-character lemmas are dropped.
    Lemmas are returned in file order; duplicates are not removed here.
    If the file does not exist, a warning is logged and an empty list
    is returned.
    """
    tokens: list[str] = []
    try:
        with open(path, encoding="utf-8", errors="replace") as f:
            for line in f:
                if not line or line[0] in (" ", "\t", "\n"):
                    continue
                # Only the first field is needed, so stop splitting after it.
                fields = line.split(None, 1)
                if not fields:
                    # Whitespace-only line that begins with some other
                    # whitespace character (e.g. a form feed): there is no
                    # lemma on it, and indexing [0] would raise IndexError.
                    continue
                lemma = fields[0]
                # Skip purely numeric tokens and single chars
                if not lemma.isdigit() and len(lemma) > 1:
                    tokens.append(lemma)
    except FileNotFoundError:
        log.warning("wordnet file not found: %s", path)
    return tokens
|
||
|
|
|
||
|
|
|
||
|
|
def collect_all_lemmas() -> list[str]:
    """
    Gather the lemmas from every file in INDEX_FILES, without duplicates.

    Files are read from WORDNET_DIR in INDEX_FILES order. A lemma that
    appears more than once keeps the position of its first occurrence.
    """
    # A dict's keys are unique and keep insertion order, so it serves as
    # both the "already seen" check and the ordered result.
    ordered: dict[str, None] = {}
    for index_name in INDEX_FILES:
        for lemma in _parse_index_file(WORDNET_DIR / index_name):
            ordered.setdefault(lemma, None)
    return list(ordered)
|
||
|
|
|
||
|
|
|
||
|
|
async def import_wordnet(pool: asyncpg.Pool) -> dict:
    """
    Insert every collected WordNet lemma into the soas table.

    Lemmas are written with saliency 0.0 and novelty 0.0, BATCH_SIZE at a
    time, over a single connection acquired from ``pool``. Tokens that
    already exist are left alone (ON CONFLICT DO NOTHING). Each row the
    database reports as newly inserted is added to ``cache.soas_by_token``
    and ``cache.soas_by_id``.

    Returns ``{"error": ...}`` if WORDNET_DIR does not exist; otherwise a
    summary dict with the keys "status", "total_lemmas", "inserted",
    "already_present" and "citation".
    """
    if not WORDNET_DIR.exists():
        return {"error": f"wordnet directory not found at {WORDNET_DIR}"}

    lemmas = collect_all_lemmas()
    total = len(lemmas)
    log.info("wordnet: %d lemmas collected, beginning import …", total)

    # RETURNING only yields rows that were actually inserted, which is how
    # new tokens are told apart from ones that were already present.
    insert_sql = """
                INSERT INTO soas (token, saliency, novelty)
                SELECT unnest($1::text[]), 0.0, 0.0
                ON CONFLICT (token) DO NOTHING
                RETURNING id, token
                """

    new_count = 0
    existing_count = 0

    async with pool.acquire() as conn:
        # One statement per batch rather than one statement for everything.
        for batch_no, start in enumerate(range(0, total, BATCH_SIZE)):
            chunk = lemmas[start : start + BATCH_SIZE]
            fresh = await conn.fetch(insert_sql, chunk)

            # Mirror the new rows into the in-memory cache.
            for record in fresh:
                entry = SoasRow(id=record["id"], token=record["token"])
                cache.soas_by_token[record["token"]] = entry
                cache.soas_by_id[record["id"]] = record["token"]

            new_count += len(fresh)
            existing_count += len(chunk) - len(fresh)

            # Progress line on every tenth batch, starting with the first.
            if batch_no % 10 == 0:
                log.info("wordnet import: %d / %d …", start + len(chunk), total)

    log.info(
        "wordnet import complete: %d inserted, %d already present, %d total",
        new_count,
        existing_count,
        total,
    )
    summary = {
        "status": "ok",
        "total_lemmas": total,
        "inserted": new_count,
        "already_present": existing_count,
        "citation": CITATION,
    }
    return summary
|