Fixes

2026-04-21 17:52:27 +02:00
parent ce82678d14
commit 2656951bb9
5 changed files with 93 additions and 24 deletions
@@ -20,6 +20,7 @@ from __future__ import annotations
 import json
 import logging
 import os
+import re
 import time
 from contextlib import asynccontextmanager
 from pathlib import Path
@@ -435,15 +436,49 @@ def _extract_text_strings(content) -> list[str]:
    return []


+# Tokens that look structural/technical — skip novel-word detection for these.
+# Matches: paths (foo/bar), emails (a@b), file extensions (foo.py), dotted names (1.2.3),
+# pure numbers, hex literals/colours.
+#
+# The pattern is meant to be used with ``search``: the first two alternatives are
+# unanchored and fire anywhere inside the token, while the remaining ones are
+# anchored with ^...$ and only apply to the token as a whole.
+# Matching is case-insensitive (re.IGNORECASE), so e.g. ``#FFF`` and ``0XAB`` count too.
+_STRUCTURAL_RE = re.compile(
+    r"[/@]"                  # any "/" (path / URL separator) or "@" (email) in the token
+    r"|\.\w"                 # "." followed by a word char (e.g. foo.py, omega.13, 1.2)
+    r"|^\d+$"                # pure digits
+    r"|^\d[\d.]+\d$"         # digit, then digits/dots, then digit (1.2, 3.4.5; also 1..2)
+    r"|^#[0-9a-f]{3,6}$"    # hex colour: "#" followed by 3 to 6 hex digits
+    r"|^0x[0-9a-f]+$",       # hex literal with 0x prefix
+    re.IGNORECASE,
+)
+
+
+def _is_structural_token(token: str) -> bool:
+    """Tell whether *token* resembles a path, version, number, or URL fragment.
+
+    The token counts as structural as soon as ``_STRUCTURAL_RE`` matches anywhere in it.
+    """
+    return _STRUCTURAL_RE.search(token) is not None
+
+
+def _sentence_containing(text: str, token: str, max_chars: int = 80) -> str:
+    """Return a short excerpt of the first sentence containing token (case-insensitive).
+
+    ``text`` is split after ``.``, ``!`` or ``?`` followed by whitespace, and on
+    newlines. The token is matched as a case-insensitive substring. The excerpt is
+    at most ``max_chars`` characters long: the start of the sentence when the token
+    fits there, otherwise a window shifted so the token stays inside it (a plain
+    prefix could cut the token out of its own context hint).
+
+    Returns "" when ``token`` is empty or no sentence contains it.
+    """
+    if not token:
+        # "" is a substring of every sentence; an excerpt for it would be arbitrary.
+        return ""
+    needle = token.lower()
+    for sentence in re.split(r"(?<=[.!?])\s+|\n+", text):
+        sentence = sentence.strip()
+        pos = sentence.lower().find(needle)
+        if pos < 0:
+            continue
+        if pos + len(needle) <= max_chars:
+            # Token already sits inside the leading prefix.
+            return sentence[:max_chars]
+        # Token lies beyond the prefix: centre the window on it, clamped to the sentence.
+        slack = max(max_chars - len(needle), 0)
+        start = max(min(pos - slack // 2, len(sentence) - max_chars), 0)
+        return sentence[start:start + max_chars]
+    return ""
+
+
 def extract_prompt_text(body: dict, path: str) -> str:
-    """Extract a flat string from a request body for saliency processing."""
+    """
+    Extract text from the most recent turns only (last user + last assistant message).
+
+    Scanning full conversation history on every request is noisy and redundant —
+    the system prompt in particular is large and stable, so its novel words were
+    already registered on the first request.
+
+    For the chat endpoints listed below, the text strings of the last ``user``
+    message and the last ``assistant`` message are joined with single spaces,
+    user text first regardless of which of the two came later. The top-level
+    ``system`` field is not read. For any other path, ``body["prompt"]`` is
+    returned, or "" when that key is absent.
+    """
     if path in ("/api/chat", "/v1/chat/completions", "/v1/messages"):
+        messages = body.get("messages", [])
+        # Walk backwards so the first hit per role is the most recent message.
+        last_user = next((m for m in reversed(messages) if m.get("role") == "user"), None)
+        last_assistant = next(
+            (m for m in reversed(messages) if m.get("role") == "assistant"), None
+        )
         parts: list[str] = []
-        # system can be a plain string OR a list of content-block dicts (Anthropic format)
-        system = body.get("system")
-        if system:
-            parts.extend(_extract_text_strings(system))
-        for m in body.get("messages", []):
+        for m in filter(None, [last_user, last_assistant]):
             parts.extend(_extract_text_strings(m.get("content", "")))
         return " ".join(parts)
     return body.get("prompt", "")
@@ -505,8 +540,8 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:

        if soas_row is None:
            # Token absent from dictionary → candidate novel domain word.
-            # Collect for batch processing; apply a per-prompt cap.
-            if len(novel_this_prompt) < MAX_NOVEL_PER_PROMPT:
+            # Skip structural tokens (paths, versions, numbers) and apply a per-prompt cap.
+            if not _is_structural_token(token) and len(novel_this_prompt) < MAX_NOVEL_PER_PROMPT:
                novel_this_prompt.append(token)
            continue

@@ -530,8 +565,10 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
            salient_for_write.append(token)

    # Create SOAS entries for novel words and add them to the read list.
+    # Capture first-seen context so zero-hit recollection can include a hint.
    for token in novel_this_prompt:
-        soas_row = await create_novel_soas(pool, token)
+        ctx = _sentence_containing(prompt_text, token)
+        soas_row = await create_novel_soas(pool, token, context=ctx)
        salient_for_read.append(soas_row.id)

    for token in salient_for_write: