# agent0/plugins/festinger/festinger/wordnet.py

"""
WordNet importer — loads Princeton WordNet 3.x index files into SOAS.

Reads index.noun, index.verb, index.adj, index.adv from the wordnet/ directory.
Each non-header line's first field is the lemma (already lowercase, underscores
for compound words — matches our compound token convention exactly).

All tokens are inserted with saliency=0, novelty=0 (common English baseline).
Insert is idempotent: ON CONFLICT DO NOTHING.

Citation:
  Princeton University "About WordNet." WordNet. Princeton University. 2010.
  https://wordnet.princeton.edu/
"""
from __future__ import annotations

import logging
from pathlib import Path
from typing import AsyncIterator

import asyncpg

from . import cache
from .cache import SoasRow

# Logger used for importer progress and warnings.
log = logging.getLogger("festinger.wordnet")

# Index files are read from a "wordnet" directory two levels above this file.
WORDNET_DIR = Path(__file__).parent.parent.joinpath("wordnet")

# One index file per part of speech, parsed in this order.
INDEX_FILES = [
    "index.noun",
    "index.verb",
    "index.adj",
    "index.adv",
]

# Number of lemmas sent to the database per INSERT statement.
BATCH_SIZE = 2000

# Attribution string returned with every successful import summary.
CITATION = (
    'Princeton University "About WordNet." '
    "WordNet. Princeton University. 2010. "
    "https://wordnet.princeton.edu/"
)


def _parse_index_file(path: Path) -> list[str]:
    """
    Extract lemma tokens from a WordNet index file.

    Header lines start with a space (or tab) or are blank — skip them, along
    with any other line that contains nothing but whitespace.
    Data line format: lemma  pos  synset_cnt  p_cnt  ...
    Lemmas are already lowercase; underscores join compound words.

    Only the first whitespace-delimited field of each data line is kept.
    Purely numeric lemmas and single-character lemmas are dropped.

    Returns the lemmas in file order; duplicates are NOT removed here.
    If the file does not exist, a warning is logged and an empty list is
    returned.
    """
    tokens: list[str] = []
    try:
        # errors="replace" keeps the import going if a file has stray bytes
        # that are not valid UTF-8.
        with open(path, encoding="utf-8", errors="replace") as f:
            for line in f:
                if line.startswith((" ", "\t", "\n")):
                    continue
                # maxsplit=1: only the first field is needed.
                fields = line.split(None, 1)
                if not fields:
                    # Whitespace-only line that did not start with one of the
                    # header characters (e.g. a lone form feed); indexing
                    # fields[0] here would raise IndexError.
                    continue
                lemma = fields[0]
                # Skip purely numeric tokens and single chars
                if not lemma.isdigit() and len(lemma) > 1:
                    tokens.append(lemma)
    except FileNotFoundError:
        log.warning("wordnet file not found: %s", path)
    return tokens


def collect_all_lemmas() -> list[str]:
    """
    Gather the lemmas of every file in INDEX_FILES into one list.

    Files are parsed in INDEX_FILES order. A lemma that appears more than
    once (within one file or across files) is kept only at the position of
    its first occurrence.
    """
    # Dict keys act as an insertion-ordered set: re-adding an existing key
    # leaves its original position untouched.
    ordered: dict[str, None] = {}
    for index_name in INDEX_FILES:
        ordered.update(dict.fromkeys(_parse_index_file(WORDNET_DIR / index_name)))
    return list(ordered)


async def import_wordnet(pool: asyncpg.Pool) -> dict:
    """
    Load every collected WordNet lemma into the soas table, batch by batch.

    Each lemma is inserted with saliency 0.0 and novelty 0.0. Tokens that
    already exist are left alone (ON CONFLICT DO NOTHING). Every row the
    database reports back as inserted is also written to
    cache.soas_by_token and cache.soas_by_id.

    Returns a summary dict with "status", "total_lemmas", "inserted",
    "already_present" and "citation". If WORDNET_DIR does not exist, returns
    a dict holding only an "error" message and touches neither the database
    nor the cache.
    """
    if not WORDNET_DIR.exists():
        return {"error": f"wordnet directory not found at {WORDNET_DIR}"}

    lemmas = collect_all_lemmas()
    total = len(lemmas)
    log.info("wordnet: %d lemmas collected, beginning import …", total)

    new_count = 0
    existing_count = 0

    async with pool.acquire() as conn:
        # Walk the lemma list one BATCH_SIZE slice at a time, all on the
        # same acquired connection.
        for batch_no, start in enumerate(range(0, total, BATCH_SIZE)):
            chunk = lemmas[start : start + BATCH_SIZE]

            # RETURNING tells us which tokens were new in this slice.
            created = await conn.fetch(
                """
                INSERT INTO soas (token, saliency, novelty)
                SELECT unnest($1::text[]), 0.0, 0.0
                ON CONFLICT (token) DO NOTHING
                RETURNING id, token
                """,
                chunk,
            )

            # Mirror the freshly inserted rows into both in-memory lookups.
            for record in created:
                row_id, token = record["id"], record["token"]
                cache.soas_by_token[token] = SoasRow(id=row_id, token=token)
                cache.soas_by_id[row_id] = token

            new_count += len(created)
            # Whatever was sent but not returned was already in the table.
            existing_count += len(chunk) - len(created)

            # Progress line on the first batch and every tenth one after it.
            if batch_no % 10 == 0:
                log.info("wordnet import: %d / %d …", start + len(chunk), total)

    log.info(
        "wordnet import complete: %d inserted, %d already present, %d total",
        new_count, existing_count, total,
    )

    summary = {
        "status": "ok",
        "total_lemmas": total,
        "inserted": new_count,
        "already_present": existing_count,
        "citation": CITATION,
    }
    return summary