Add context-aware LLM extraction for novel words (one call per turn instead of one per concept)

2026-04-21 18:32:21 +02:00
parent 314f145740
commit 128dd653e7
3 changed files with 125 additions and 70 deletions
@@ -44,7 +44,7 @@ from .cue_scanner import scan_cues
 from .recollection import build_recollection_block, inject_recollection
 from .resolution_job import run_resolution_job, last_run_timestamp
 from .tokenizer import tokenize
-from .write_queue import enqueue_concept, enqueue_cue, start_worker, stop_worker
+from .write_queue import enqueue_context_extract, enqueue_cue, start_worker, stop_worker
 from .urd_writer import InsertRequest, insert_urd_edge
 from .wordnet import import_wordnet, CITATION as WORDNET_CITATION
 from .test_scenarios import SCENARIOS, seed_scenario, reset_scenario
@@ -528,11 +528,10 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
    # 2. Tokenise + update saliency
    tokens = tokenize(prompt_text)
    salient_for_read: list[int] = []
-    salient_for_write: list[str] = []

-    # Novel words found this prompt that aren't in the cache yet.
-    # We cap at MAX_NOVEL_PER_PROMPT to avoid flooding on large system prompts.
-    MAX_NOVEL_PER_PROMPT = 5
+    # Novel domain words found in this turn — not in the standard dictionary.
+    # Capped to avoid flooding on unexpectedly large turns.
+    MAX_NOVEL_PER_PROMPT = 8
    novel_this_prompt: list[str] = []

    for token in tokens:
@@ -540,7 +539,7 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:

        if soas_row is None:
            # Token absent from dictionary → candidate novel domain word.
-            # Skip structural tokens (paths, versions, numbers) and apply a per-prompt cap.
+            # Skip structural tokens (paths, versions, numbers); the condition below also enforces the per-prompt cap.
            if not _is_structural_token(token) and len(novel_this_prompt) < MAX_NOVEL_PER_PROMPT:
                novel_this_prompt.append(token)
            continue
@@ -554,16 +553,6 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
        if soas_row.saliency >= read_threshold:
            salient_for_read.append(soas_row.id)

-        # Only enqueue for LLM write if the concept already has URD edges —
-        # i.e. we know *something* about it and may want to expand that knowledge.
-        # Never enqueue freshly-novel words: let the conversation teach us instead.
-        if (
-            soas_row.saliency >= write_threshold
-            and soas_row.novelty > 0.0
-            and cache.urd_by_concept.get(soas_row.id)
-        ):
-            salient_for_write.append(token)
-
    # Create SOAS entries for novel words and add them to the read list.
    # Capture first-seen context so zero-hit recollection can include a hint.
    for token in novel_this_prompt:
@@ -571,8 +560,11 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
        soas_row = await create_novel_soas(pool, token, context=ctx)
        salient_for_read.append(soas_row.id)

-    for token in salient_for_write:
-        await enqueue_concept(token)
+    # Enqueue one context-aware LLM extraction covering the novel words collected
+    # this turn (at most MAX_NOVEL_PER_PROMPT). The LLM reads the actual conversation
+    # text and extracts relationships from evidence — one call per turn, not one per concept.
+    if novel_this_prompt:
+        await enqueue_context_extract(novel_this_prompt, prompt_text)

    if not salient_for_read:
        return body