plugins/festinger/festinger/wordnet.py

"""
WordNet importer — loads Princeton WordNet 3.x index files into SOAS.

Reads index.noun, index.verb, index.adj, index.adv from the wordnet/ directory.
Each non-header line's first field is the lemma (already lowercase, underscores
for compound words — matches our compound token convention exactly).

All tokens are inserted with saliency=0, novelty=0 (common English baseline).
Insert is idempotent: ON CONFLICT DO NOTHING.

Citation:
  Princeton University "About WordNet." WordNet. Princeton University. 2010.
  https://wordnet.princeton.edu/
"""
from __future__ import annotations

import logging
from pathlib import Path
from typing import AsyncIterator

import asyncpg

from . import cache
from .cache import SoasRow

# Logger shared by every function in this module.
log = logging.getLogger("festinger.wordnet")

# Location of the WordNet index files: a "wordnet" folder that sits next to
# the directory containing this module.
WORDNET_DIR = Path(__file__).parent.parent / "wordnet"
# Index files read by collect_all_lemmas(), in this order.
INDEX_FILES = ["index.noun", "index.verb", "index.adj", "index.adv"]
# Number of lemmas sent to the database per INSERT in import_wordnet().
BATCH_SIZE = 2000

# Attribution string returned in the import_wordnet() summary dict.
CITATION = (
    'Princeton University "About WordNet." WordNet. '
    "Princeton University. 2010. https://wordnet.princeton.edu/"
)


def _parse_index_file(path: Path) -> list[str]:
    """
    Extract lemma tokens from a WordNet index file.

    Header lines start with a space or tab, or are blank — they are skipped.
    Data line format: lemma  pos  synset_cnt  p_cnt  ...
    Only the first whitespace-delimited field (the lemma) is kept; it is
    taken verbatim, with no case folding or other normalisation.

    Purely numeric lemmas and single-character lemmas are dropped. Lines
    that consist solely of whitespace (e.g. a stray form feed) are skipped
    instead of raising IndexError.

    Returns the lemmas in file order; duplicates are NOT removed here.
    If the file does not exist, a warning is logged and an empty list is
    returned.
    """
    tokens: list[str] = []
    try:
        # errors="replace" keeps the import going past undecodable bytes.
        with open(path, encoding="utf-8", errors="replace") as f:
            for line in f:
                # Header / blank lines.
                if line.startswith((" ", "\t", "\n")):
                    continue
                # maxsplit=1: only the first field is needed.
                fields = line.split(maxsplit=1)
                if not fields:
                    # Whitespace-only line that slipped past the header check.
                    continue
                lemma = fields[0]
                # Skip purely numeric tokens and single chars
                if not lemma.isdigit() and len(lemma) > 1:
                    tokens.append(lemma)
    except FileNotFoundError:
        log.warning("wordnet file not found: %s", path)
    return tokens


def collect_all_lemmas() -> list[str]:
    """
    Gather the lemmas from every file listed in INDEX_FILES, without repeats.

    Files are read in INDEX_FILES order, and each lemma keeps the position of
    its first occurrence across all files.
    """
    # A dict keeps insertion order, and setdefault on an existing key leaves
    # that key's position untouched — so this is a first-seen-wins dedup.
    first_seen: dict[str, None] = {}
    for index_name in INDEX_FILES:
        for lemma in _parse_index_file(WORDNET_DIR / index_name):
            first_seen.setdefault(lemma)
    return list(first_seen)


async def import_wordnet(pool: asyncpg.Pool) -> dict:
    """
    Insert every collected WordNet lemma into the soas table.

    Each lemma is written with saliency 0.0 and novelty 0.0. Rows whose token
    already exists are left alone (ON CONFLICT DO NOTHING), so re-running the
    import does not create duplicates. Every row the database reports as
    newly inserted is also registered in cache.soas_by_token and
    cache.soas_by_id.

    Returns:
        {"error": ...} if WORDNET_DIR does not exist; otherwise a summary
        dict with keys "status", "total_lemmas", "inserted",
        "already_present" and "citation".
    """
    if not WORDNET_DIR.exists():
        return {"error": f"wordnet directory not found at {WORDNET_DIR}"}

    lemmas = collect_all_lemmas()
    total = len(lemmas)
    log.info("wordnet: %d lemmas collected, beginning import …", total)

    inserted = skipped = 0

    async with pool.acquire() as conn:
        # One INSERT statement per BATCH_SIZE lemmas.
        for batch_no, start in enumerate(range(0, total, BATCH_SIZE)):
            chunk = lemmas[start : start + BATCH_SIZE]

            # RETURNING yields only the rows that were actually inserted.
            new_rows = await conn.fetch(
                """
                INSERT INTO soas (token, saliency, novelty)
                SELECT unnest($1::text[]), 0.0, 0.0
                ON CONFLICT (token) DO NOTHING
                RETURNING id, token
                """,
                chunk,
            )

            # Mirror the fresh rows into both in-memory lookup tables.
            for row in new_rows:
                row_id, token = row["id"], row["token"]
                cache.soas_by_token[token] = SoasRow(id=row_id, token=token)
                cache.soas_by_id[row_id] = token

            inserted += len(new_rows)
            # Anything in the chunk that was not returned counts as skipped.
            skipped += len(chunk) - len(new_rows)

            # Progress line on every tenth batch (including the first).
            if batch_no % 10 == 0:
                log.info("wordnet import: %d / %d …", start + len(chunk), total)

    log.info(
        "wordnet import complete: %d inserted, %d already present, %d total",
        inserted, skipped, total,
    )
    summary = {
        "status": "ok",
        "total_lemmas": total,
        "inserted": inserted,
        "already_present": skipped,
        "citation": CITATION,
    }
    return summary
Adding Festinger with wordnet 2026-04-19 16:16:13 +02:00			`"""`
			`WordNet importer — loads Princeton WordNet 3.x index files into SOAS.`

			`Reads index.noun, index.verb, index.adj, index.adv from the wordnet/ directory.`
			`Each non-header line's first field is the lemma (already lowercase, underscores`
			`for compound words — matches our compound token convention exactly).`

			`All tokens are inserted with saliency=0, novelty=0 (common English baseline).`
			`Insert is idempotent: ON CONFLICT DO NOTHING.`

			`Citation:`
			`Princeton University "About WordNet." WordNet. Princeton University. 2010.`
			`https://wordnet.princeton.edu/`
			`"""`
			`from __future__ import annotations`

			`import logging`
			`from pathlib import Path`
			`from typing import AsyncIterator`

			`import asyncpg`

			`from . import cache`
			`from .cache import SoasRow`

			`log = logging.getLogger("festinger.wordnet")`

			`WORDNET_DIR = Path(__file__).parent.parent / "wordnet"`
			`INDEX_FILES = ["index.noun", "index.verb", "index.adj", "index.adv"]`
			`BATCH_SIZE = 2000`

			`CITATION = (`
			`'Princeton University "About WordNet." WordNet. '`
			`"Princeton University. 2010. https://wordnet.princeton.edu/"`
			`)`


			`def _parse_index_file(path: Path) -> list[str]:`
			`"""`
			`Extract lemma tokens from a WordNet index file.`
			`Header lines start with a space or are blank — skip them.`
			`Data line format: lemma pos synset_cnt p_cnt ...`
			`Lemmas are already lowercase; underscores join compound words.`
			`"""`
			`tokens: list[str] = []`
			`try:`
			`with open(path, encoding="utf-8", errors="replace") as f:`
			`for line in f:`
			`if not line or line[0] in (" ", "\t", "\n"):`
			`continue`
			`lemma = line.split()[0]`
			`# Skip purely numeric tokens and single chars`
			`if lemma and not lemma.isdigit() and len(lemma) > 1:`
			`tokens.append(lemma)`
			`except FileNotFoundError:`
			`log.warning("wordnet file not found: %s", path)`
			`return tokens`


			`def collect_all_lemmas() -> list[str]:`
			`"""Parse all four index files and return a deduplicated list of lemmas."""`
			`seen: set[str] = set()`
			`result: list[str] = []`
			`for fname in INDEX_FILES:`
			`for token in _parse_index_file(WORDNET_DIR / fname):`
			`if token not in seen:`
			`seen.add(token)`
			`result.append(token)`
			`return result`


			`async def import_wordnet(pool: asyncpg.Pool) -> dict:`
			`"""`
			`Bulk-load all WordNet lemmas into SOAS (saliency=0, novelty=0).`
			`Updates the in-memory cache with any newly inserted tokens.`
			`Returns a summary dict.`
			`"""`
			`if not WORDNET_DIR.exists():`
			`return {"error": f"wordnet directory not found at {WORDNET_DIR}"}`

			`lemmas = collect_all_lemmas()`
			`total = len(lemmas)`
			`log.info("wordnet: %d lemmas collected, beginning import …", total)`

			`inserted = 0`
			`skipped = 0`

			`async with pool.acquire() as conn:`
			`# Process in batches to avoid huge transactions`
			`for i in range(0, total, BATCH_SIZE):`
			`batch = lemmas[i : i + BATCH_SIZE]`

			`# INSERT … ON CONFLICT DO NOTHING, then RETURNING to know what was new`
			`rows = await conn.fetch(`
			`"""`
			`INSERT INTO soas (token, saliency, novelty)`
			`SELECT unnest($1::text[]), 0.0, 0.0`
			`ON CONFLICT (token) DO NOTHING`
			`RETURNING id, token`
			`""",`
			`batch,`
			`)`

			`for r in rows:`
			`soas_row = SoasRow(id=r["id"], token=r["token"])`
			`cache.soas_by_token[r["token"]] = soas_row`
			`cache.soas_by_id[r["id"]] = r["token"]`
			`inserted += 1`

			`skipped += len(batch) - len(rows)`

			`if (i // BATCH_SIZE) % 10 == 0:`
			`log.info("wordnet import: %d / %d …", i + len(batch), total)`

			`log.info(`
			`"wordnet import complete: %d inserted, %d already present, %d total",`
			`inserted, skipped, total,`
			`)`
			`return {`
			`"status": "ok",`
			`"total_lemmas": total,`
			`"inserted": inserted,`
			`"already_present": skipped,`
			`"citation": CITATION,`
			`}`