This commit is contained in:
2026-04-21 17:52:27 +02:00
parent ce82678d14
commit 2656951bb9
5 changed files with 93 additions and 24 deletions
+46 -9
View File
@@ -20,6 +20,7 @@ from __future__ import annotations
import json
import logging
import os
import re
import time
from contextlib import asynccontextmanager
from pathlib import Path
@@ -435,15 +436,49 @@ def _extract_text_strings(content) -> list[str]:
return []
# Tokens that look structural/technical — skip novel-word detection for these.
# Matches: paths (foo/bar), emails (a@b), file extensions (foo.py), dotted names (1.2.3),
# pure numbers, hex literals/colours.
_STRUCTURAL_RE = re.compile(
r"[/@]" # URL-like separator or email @
r"|\.\w" # dotted extension or namespace (e.g. foo.py, omega.13)
r"|^\d+$" # pure digits
r"|^\d[\d.]+\d$" # version string like 1.2 or 3.4.5
r"|^#[0-9a-f]{3,6}$" # hex colour
r"|^0x[0-9a-f]+$", # hex literal
re.IGNORECASE,
)
def _is_structural_token(token: str) -> bool:
    """Report whether ``token`` matches the module's structural-token pattern.

    The pattern (``_STRUCTURAL_RE``) is searched anywhere in ``token``, so a
    single matching fragment is enough for the token to count as structural.
    """
    return _STRUCTURAL_RE.search(token) is not None
def _sentence_containing(text: str, token: str, max_chars: int = 80) -> str:
"""Return a short excerpt of the first sentence containing token (case-insensitive)."""
for sentence in re.split(r"(?<=[.!?])\s+|\n+", text):
if token.lower() in sentence.lower():
return sentence.strip()[:max_chars]
return ""
def extract_prompt_text(body: dict, path: str) -> str:
    """Extract a flat string from a request body for saliency processing.

    For chat-style endpoints (``/api/chat``, ``/v1/chat/completions``,
    ``/v1/messages``) only the most recent turns are used: the last message
    with role ``"user"`` and the last message with role ``"assistant"``, in
    that order. Earlier history and the top-level ``system`` field are
    deliberately not scanned. Re-reading them on every request is noisy and
    redundant, and the system prompt in particular is large and stable.

    For any other path the body's ``"prompt"`` value is returned, or ``""``
    when it is missing or ``None``.
    """
    if path not in ("/api/chat", "/v1/chat/completions", "/v1/messages"):
        return body.get("prompt") or ""

    # ``or []`` also covers an explicit ``"messages": null`` in the payload.
    messages = body.get("messages") or []

    parts: list[str] = []
    for role in ("user", "assistant"):
        # Walk backwards so the first hit is the most recent message for the
        # role; entries that are not dicts are skipped rather than crashing.
        latest = next(
            (
                m
                for m in reversed(messages)
                if isinstance(m, dict) and m.get("role") == role
            ),
            None,
        )
        if latest is not None:
            parts.extend(_extract_text_strings(latest.get("content", "")))
    return " ".join(parts)
@@ -505,8 +540,8 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
if soas_row is None:
# Token absent from dictionary → candidate novel domain word.
# Collect for batch processing; apply a per-prompt cap.
if len(novel_this_prompt) < MAX_NOVEL_PER_PROMPT:
# Skip structural tokens (paths, versions, numbers) and apply a per-prompt cap.
if not _is_structural_token(token) and len(novel_this_prompt) < MAX_NOVEL_PER_PROMPT:
novel_this_prompt.append(token)
continue
@@ -530,8 +565,10 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
salient_for_write.append(token)
# Create SOAS entries for novel words and add them to the read list.
# Capture first-seen context so zero-hit recollection can include a hint.
for token in novel_this_prompt:
soas_row = await create_novel_soas(pool, token)
ctx = _sentence_containing(prompt_text, token)
soas_row = await create_novel_soas(pool, token, context=ctx)
salient_for_read.append(soas_row.id)
for token in salient_for_write: