From 2656951bb9cc89765bed8965170cefd71cba66f2 Mon Sep 17 00:00:00 2001 From: jenstandstad Date: Tue, 21 Apr 2026 17:52:27 +0200 Subject: [PATCH] Fixes --- plugins/festinger/festinger/cache.py | 5 ++ plugins/festinger/festinger/db.py | 26 +++++++--- plugins/festinger/festinger/main.py | 55 +++++++++++++++++---- plugins/festinger/festinger/recollection.py | 30 ++++++++--- plugins/festinger/festinger/urd_writer.py | 1 + 5 files changed, 93 insertions(+), 24 deletions(-) diff --git a/plugins/festinger/festinger/cache.py b/plugins/festinger/festinger/cache.py index 7e61ca9..ed13bbc 100644 --- a/plugins/festinger/festinger/cache.py +++ b/plugins/festinger/festinger/cache.py @@ -17,6 +17,7 @@ class SoasRow: encounter_count: int = 0 saliency: float = 0.0 novelty: float = 0.0 + first_seen_context: str = "" @dataclass @@ -48,6 +49,10 @@ urd_by_concept: dict[int, list[UrdEdge]] = {} # Mirrors Postgres UNIQUE index on (id, dim_id) exactly urd_by_concept_dim: dict[tuple[int, int], UrdEdge] = {} +# URD — reverse lookup: parent_id → list of edges where this id is a parent +# Used for bidirectional recollection (surface "gnommoweb is in omega13" when omega13 is mentioned) +urd_by_parent: dict[int, list[UrdEdge]] = {} + # Concepts with pending items in resolution_queue pending_conflicts: set[int] = set() diff --git a/plugins/festinger/festinger/db.py b/plugins/festinger/festinger/db.py index e7f62a6..e66d23f 100644 --- a/plugins/festinger/festinger/db.py +++ b/plugins/festinger/festinger/db.py @@ -56,6 +56,10 @@ async def init_schema(pool: asyncpg.Pool) -> None: await conn.execute( "ALTER TABLE models ADD COLUMN IF NOT EXISTS base_url TEXT NOT NULL DEFAULT ''" ) + # Migration: first_seen_context stores the sentence where a novel word first appeared + await conn.execute( + "ALTER TABLE soas ADD COLUMN IF NOT EXISTS first_seen_context TEXT NOT NULL DEFAULT ''" + ) log.info("schema applied") @@ -121,7 +125,7 @@ async def warm_cache(pool: asyncpg.Pool) -> None: """Load all SOAS and URD rows into the in-memory cache.""" async with pool.acquire() as conn: soas_rows = await conn.fetch( - "SELECT id, token, encounter_count, saliency, novelty FROM soas" + "SELECT id, token, encounter_count, saliency, novelty, first_seen_context FROM soas" ) for r in soas_rows: row = SoasRow( @@ -130,6 +134,7 @@ async def warm_cache(pool: asyncpg.Pool) -> None: encounter_count=r["encounter_count"], saliency=r["saliency"], novelty=r["novelty"], + first_seen_context=r["first_seen_context"] or "", ) cache.soas_by_token[r["token"]] = row cache.soas_by_id[r["id"]] = r["token"] @@ -158,6 +163,7 @@ async def warm_cache(pool: asyncpg.Pool) -> None: ) cache.urd_by_concept.setdefault(r["id"], []).append(edge) cache.urd_by_concept_dim[(r["id"], r["dim_id"])] = edge + cache.urd_by_parent.setdefault(r["parent_id"], []).append(edge) pending = await conn.fetch( "SELECT DISTINCT concept_id FROM resolution_queue WHERE status = 'pending'" @@ -176,6 +182,7 @@ async def reload_urd_cache(pool: asyncpg.Pool) -> None: """Rebuild only URD cache (called after nightly resolution job).""" cache.urd_by_concept.clear() cache.urd_by_concept_dim.clear() + cache.urd_by_parent.clear() async with pool.acquire() as conn: urd_rows = await conn.fetch( """ @@ -200,6 +207,7 @@ async def reload_urd_cache(pool: asyncpg.Pool) -> None: ) cache.urd_by_concept.setdefault(r["id"], []).append(edge) cache.urd_by_concept_dim[(r["id"], r["dim_id"])] = edge + cache.urd_by_parent.setdefault(r["parent_id"], []).append(edge) pending = await conn.fetch( "SELECT DISTINCT concept_id FROM resolution_queue WHERE status = 'pending'" @@ -221,7 +229,7 @@ async def get_or_create_soas(pool: asyncpg.Pool, token: str) -> SoasRow: row = await conn.fetchrow( "INSERT INTO soas (token) VALUES ($1) " "ON CONFLICT (token) DO UPDATE SET token = EXCLUDED.token " - "RETURNING id, token, encounter_count, saliency, novelty", + "RETURNING id, token, encounter_count, saliency, novelty, first_seen_context", token, ) @@ -231,6 +239,7 @@ async def get_or_create_soas(pool: asyncpg.Pool, token: str) -> SoasRow: encounter_count=row["encounter_count"], saliency=row["saliency"], novelty=row["novelty"], + first_seen_context=row["first_seen_context"] or "", ) cache.soas_by_token[token] = soas_row cache.soas_by_id[row["id"]] = token @@ -247,24 +256,25 @@ async def get_or_create_soas(pool: asyncpg.Pool, token: str) -> SoasRow: NOVEL_INITIAL_SALIENCY = 2.0 -async def create_novel_soas(pool: asyncpg.Pool, token: str) -> SoasRow: +async def create_novel_soas(pool: asyncpg.Pool, token: str, context: str = "") -> SoasRow: """ Insert a domain-specific (non-dictionary) token with an initial saliency high enough to trigger recollection on the very first encounter. novelty=1.0 distinguishes these rows from common-English seeds. + context is the sentence fragment where the word was first encountered. Idempotent — returns the existing row if already present. """ async with pool.acquire() as conn: row = await conn.fetchrow( """ - INSERT INTO soas (token, saliency, novelty, encounter_count) - VALUES ($1, $2, 1.0, 1) + INSERT INTO soas (token, saliency, novelty, encounter_count, first_seen_context) + VALUES ($1, $2, 1.0, 1, $3) ON CONFLICT (token) DO UPDATE SET encounter_count = soas.encounter_count + 1, last_seen = now() - RETURNING id, token, encounter_count, saliency, novelty + RETURNING id, token, encounter_count, saliency, novelty, first_seen_context """, - token, NOVEL_INITIAL_SALIENCY, + token, NOVEL_INITIAL_SALIENCY, context, ) soas_row = SoasRow( id=row["id"], @@ -272,6 +282,7 @@ async def create_novel_soas(pool: asyncpg.Pool, token: str) -> SoasRow: encounter_count=row["encounter_count"], saliency=row["saliency"], novelty=row["novelty"], + first_seen_context=row["first_seen_context"] or "", ) cache.soas_by_token[token] = soas_row cache.soas_by_id[row["id"]] = token @@ -314,6 +325,7 @@ async def reset_graph(pool: asyncpg.Pool) -> dict: # Clear in-memory state cache.urd_by_concept.clear() cache.urd_by_concept_dim.clear() + cache.urd_by_parent.clear() cache.pending_conflicts.clear() # Remove domain words from SOAS cache (keep novelty=0 entries) domain_tokens = [t for t, r in list(cache.soas_by_token.items()) if r.novelty > 0] diff --git a/plugins/festinger/festinger/main.py b/plugins/festinger/festinger/main.py index 6028db2..25b67a9 100644 --- a/plugins/festinger/festinger/main.py +++ b/plugins/festinger/festinger/main.py @@ -20,6 +20,7 @@ from __future__ import annotations import json import logging import os +import re import time from contextlib import asynccontextmanager from pathlib import Path @@ -435,15 +436,49 @@ def _extract_text_strings(content) -> list[str]: return [] +# Tokens that look structural/technical — skip novel-word detection for these. +# Matches: paths (foo/bar), emails (a@b), file extensions (foo.py), dotted names (1.2.3), +# pure numbers, hex literals/colours. +_STRUCTURAL_RE = re.compile( + r"[/@]" # URL-like separator or email @ + r"|\.\w" # dotted extension or namespace (e.g. foo.py, omega.13) + r"|^\d+$" # pure digits + r"|^\d[\d.]+\d$" # version string like 1.2 or 3.4.5 + r"|^#[0-9a-f]{3,6}$" # hex colour + r"|^0x[0-9a-f]+$", # hex literal + re.IGNORECASE, +) + + +def _is_structural_token(token: str) -> bool: + """Return True if token looks like a path, version, number, or URL fragment.""" + return bool(_STRUCTURAL_RE.search(token)) + + +def _sentence_containing(text: str, token: str, max_chars: int = 80) -> str: + """Return a short excerpt of the first sentence containing token (case-insensitive).""" + for sentence in re.split(r"(?<=[.!?])\s+|\n+", text): + if token.lower() in sentence.lower(): + return sentence.strip()[:max_chars] + return "" + + def extract_prompt_text(body: dict, path: str) -> str: - """Extract a flat string from a request body for saliency processing.""" + """ + Extract text from the most recent turns only (last user + last assistant message). + + Scanning full conversation history on every request is noisy and redundant — + the system prompt in particular is large and stable, so its novel words were + already registered on the first request. + """ if path in ("/api/chat", "/v1/chat/completions", "/v1/messages"): + messages = body.get("messages", []) + last_user = next((m for m in reversed(messages) if m.get("role") == "user"), None) + last_assistant = next( + (m for m in reversed(messages) if m.get("role") == "assistant"), None + ) parts: list[str] = [] - # system can be a plain string OR a list of content-block dicts (Anthropic format) - system = body.get("system") - if system: - parts.extend(_extract_text_strings(system)) - for m in body.get("messages", []): + for m in filter(None, [last_user, last_assistant]): parts.extend(_extract_text_strings(m.get("content", ""))) return " ".join(parts) return body.get("prompt", "") @@ -505,8 +540,8 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict: if soas_row is None: # Token absent from dictionary → candidate novel domain word. - # Collect for batch processing; apply a per-prompt cap. - if len(novel_this_prompt) < MAX_NOVEL_PER_PROMPT: + # Skip structural tokens (paths, versions, numbers) and apply a per-prompt cap. + if not _is_structural_token(token) and len(novel_this_prompt) < MAX_NOVEL_PER_PROMPT: novel_this_prompt.append(token) continue @@ -530,8 +565,10 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict: salient_for_write.append(token) # Create SOAS entries for novel words and add them to the read list. + # Capture first-seen context so zero-hit recollection can include a hint. for token in novel_this_prompt: - soas_row = await create_novel_soas(pool, token) + ctx = _sentence_containing(prompt_text, token) + soas_row = await create_novel_soas(pool, token, context=ctx) salient_for_read.append(soas_row.id) for token in salient_for_write: diff --git a/plugins/festinger/festinger/recollection.py b/plugins/festinger/festinger/recollection.py index 346de13..3e4f366 100644 --- a/plugins/festinger/festinger/recollection.py +++ b/plugins/festinger/festinger/recollection.py @@ -17,7 +17,7 @@ from .cache import SoasRow, UrdEdge log = logging.getLogger("festinger.recollection") -ZERO_HIT_TEMPLATE = "You can't remember what {concept} is. Ask." +ZERO_HIT_TEMPLATE = "You have no memory of {concept}. Try: memory_load query='{concept}' or gutask context {concept}" # --------------------------------------------------------------------------- @@ -53,24 +53,36 @@ def query_edges( # Renderers # --------------------------------------------------------------------------- -def render_hit(concept_token: str, edges: list[UrdEdge], concept_id: int) -> str: +def render_hit( + concept_token: str, + edges: list[UrdEdge], + concept_id: int, + reverse_edges: list[UrdEdge] | None = None, +) -> str: """ Render one concept's recollection line: gnommoweb: [type] repo [membership] glitch_university + omega13: [membership←] gnommoweb (reverse: gnommoweb is a member of omega13) Pending-conflict dimensions get a '?' suffix on the dim token. """ has_pending = concept_id in cache.pending_conflicts parts = [] for e in edges: - dim_label = e.dim_token - if has_pending: - dim_label += "?" + dim_label = e.dim_token + ("?" if has_pending else "") parts.append(f"[{dim_label}] {e.parent_token}") + # Bidirectional: surface concepts where this concept appears as a parent + for e in (reverse_edges or [])[:3]: + child_token = cache.soas_by_id.get(e.concept_id, str(e.concept_id)) + parts.append(f"[{e.dim_token}←] {child_token}") return f"{concept_token}: {' '.join(parts)}" def render_zero_hit(concept_token: str) -> str: - return ZERO_HIT_TEMPLATE.format(concept=concept_token) + line = ZERO_HIT_TEMPLATE.format(concept=concept_token) + soas_row = cache.soas_by_token.get(concept_token) + if soas_row and soas_row.first_seen_context: + line += f" [first seen: '{soas_row.first_seen_context}']" + return line def build_recollection_block( @@ -98,9 +110,11 @@ def build_recollection_block( token = cache.soas_by_id.get(cid, str(cid)) edges = query_edges(cid, confidence_floor, recency_days) + # Bidirectional: find edges where this concept appears as a parent + reverse_edges = cache.urd_by_parent.get(cid, []) - if edges: - hit_lines.append(render_hit(token, edges, cid)) + if edges or reverse_edges: + hit_lines.append(render_hit(token, edges, cid, reverse_edges)) else: zero_lines.append(render_zero_hit(token)) diff --git a/plugins/festinger/festinger/urd_writer.py b/plugins/festinger/festinger/urd_writer.py index 67c4129..b8b0529 100644 --- a/plugins/festinger/festinger/urd_writer.py +++ b/plugins/festinger/festinger/urd_writer.py @@ -132,6 +132,7 @@ async def insert_urd_edge( ) cache.urd_by_concept.setdefault(req.concept_id, []).append(edge) cache.urd_by_concept_dim[key] = edge + cache.urd_by_parent.setdefault(req.parent_id, []).append(edge) log.info( "urd insert concept=%d parent=%d dim=%d is_isa=%s source=%s", req.concept_id, req.parent_id, req.dim_id, req.is_isa, req.source,