From 7210fe2066cd61a9fce31431584f6269e65a7fa9 Mon Sep 17 00:00:00 2001
From: jenstandstad <jens.tandstad@gmail.com>
Date: Tue, 21 Apr 2026 19:17:51 +0200
Subject: [PATCH] =?UTF-8?q?Mirror=20request=20model=20for=20context=20disc?=
 =?UTF-8?q?overy=20=E2=80=94=20no=20write=5Fmodel=5Fid=20needed?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Festinger now extracts provider/model/api-key from every intercepted
request and passes it to the context-discover queue as a fallback_model.
_process_context_discover uses it when write_model_id is not configured,
so Agent0's current model (LM Studio, Ollama, Anthropic) is automatically
reused for utility LLM calls without any extra setup.

Priority: write_model_id (explicit override) > fallback_model (request mirror)

Also updates upstream_openai default in config.yaml to LM Studio's
local address (host.docker.internal:1234).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 plugins/festinger/config.yaml              |  5 +-
 plugins/festinger/festinger/main.py        | 73 ++++++++++++++++++++--
 plugins/festinger/festinger/write_queue.py | 53 ++++++++++++----
 3 files changed, 111 insertions(+), 20 deletions(-)

diff --git a/plugins/festinger/config.yaml b/plugins/festinger/config.yaml
index 23e2221..a728739 100644
--- a/plugins/festinger/config.yaml
+++ b/plugins/festinger/config.yaml
@@ -7,9 +7,10 @@ upstream_ollama: "http://host.docker.internal:11434"
 # Override via UPSTREAM_ANTHROPIC env var if needed
 upstream_anthropic: "https://api.anthropic.com"
 
-# Where the real OpenAI-compatible API is running (for /v1/chat/completions)
+# Where the real OpenAI-compatible API is running (for /v1/chat/completions).
+# For LM Studio set this to its local address, e.g. "http://host.docker.internal:1234"
 # Override via UPSTREAM_OPENAI env var if needed
-upstream_openai: "https://api.openai.com"
+upstream_openai: "http://host.docker.internal:1234"
 
 # Port this proxy listens on inside the container (exposed as 11434 on the docker network)
 proxy_port: 11434
diff --git a/plugins/festinger/festinger/main.py b/plugins/festinger/festinger/main.py
index b68968c..e8d43e7 100644
--- a/plugins/festinger/festinger/main.py
+++ b/plugins/festinger/festinger/main.py
@@ -44,6 +44,7 @@ from .cue_scanner import scan_cues
 from .recollection import build_recollection_block, inject_recollection
 from .resolution_job import run_resolution_job, last_run_timestamp
 from .tokenizer import tokenize
+from .llm_client import ModelConfig
 from .write_queue import enqueue_context_discover, enqueue_cue, start_worker, stop_worker
 from .urd_writer import InsertRequest, insert_urd_edge
 from .wordnet import import_wordnet, CITATION as WORDNET_CITATION
@@ -517,11 +518,67 @@ def inject_recollection_anthropic(body: dict, block: str) -> dict:
     return body
 
 
+# ---------------------------------------------------------------------------
+# Request model mirroring
+# ---------------------------------------------------------------------------
+
+def _extract_request_model_config(
+    path: str,
+    body: dict,
+    request_headers: dict,
+    cfg: dict,
+) -> ModelConfig | None:
+    """
+    Build a ModelConfig from the intercepted request so Festinger's utility
+    LLM calls (context discovery) can use the same provider/model as Agent0 —
+    no separate write_model_id configuration needed.
+
+    Provider inference:
+      /v1/messages            → anthropic
+      /v1/chat/completions    → lm-studio (OpenAI-compatible; base_url from upstream_openai)
+      /api/chat, /api/generate → lm-studio (Ollama's OpenAI-compat endpoint; base_url from upstream_ollama)
+    """
+    model_name = body.get("model", "")
+    if not model_name:
+        return None
+
+    if path == "/v1/messages":
+        api_key = request_headers.get("x-api-key", "")
+        return ModelConfig(
+            provider="claude",
+            model_name=model_name,
+            api_key=api_key,
+        )
+
+    if path == "/v1/chat/completions":
+        auth = request_headers.get("authorization", "")
+        api_key = auth[len("Bearer "):].strip() if auth.lower().startswith("bearer ") else auth
+        base_url = cfg.get("upstream_openai", "")
+        return ModelConfig(
+            provider="lm-studio",
+            model_name=model_name,
+            api_key=api_key or "lm-studio",
+            base_url=base_url,
+        )
+
+    if path in ("/api/chat", "/api/generate"):
+        # Ollama exposes an OpenAI-compatible endpoint at the same base URL.
+        base_url = cfg.get("upstream_ollama", "")
+        return ModelConfig(
+            provider="lm-studio",
+            model_name=model_name,
+            api_key="ollama",
+            base_url=base_url,
+        )
+
+    return None
+
+
 # ---------------------------------------------------------------------------
 # Saliency + recollection pipeline
 # ---------------------------------------------------------------------------
 
-async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
+async def process_prompt(body: dict, path: str, pool, cfg: dict, request_headers: dict | None = None) -> dict:
     """
     Run the saliency + recollection pipeline over the prompt.
     Returns a (possibly modified) body dict with the recollection block injected.
@@ -530,6 +587,10 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
     conf_floor = float(await get_config(pool, "recollection_confidence_floor", "0.6"))
     recency_days = int(await get_config(pool, "recollection_recency_days", "90"))
 
+    # Derive a ModelConfig from the intercepted request so context discovery can
+    # mirror Agent0's current model without a separate write_model_id config.
+    request_model = _extract_request_model_config(path, body, request_headers or {}, cfg)
+
     # Extract only the last user message — agent responses and reasoning traces
     # are noise for both cue scanning and concept discovery.
     user_text = _last_user_message_text(body, path)
@@ -590,7 +651,7 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
 
     # 3. Enqueue for LLM-driven discovery if there are candidates to evaluate.
     if novel_candidates and len(user_text) >= 20:
-        await enqueue_context_discover(user_text, novel_candidates)
+        await enqueue_context_discover(user_text, novel_candidates, fallback_model=request_model)
 
     if not salient_for_read:
         return body
@@ -628,7 +689,7 @@ async def chat(request: Request) -> Response:
     min_len = cfg["detection"]["min_length"]
     log.info("chat  route=/api/chat model=%s", model)
     try:
-        body = await process_prompt(body, "/api/chat", pool, cfg)
+        body = await process_prompt(body, "/api/chat", pool, cfg, dict(request.headers))
         text, raw = await call_ollama("/api/chat", body, upstream)
         sess = session_key(model, body.get("messages", []))
         count = record_and_check(sess, text, min_len)
@@ -661,7 +722,7 @@ async def generate(request: Request) -> Response:
     min_len = cfg["detection"]["min_length"]
     log.info("chat  route=/api/generate model=%s", model)
     try:
-        body = await process_prompt(body, "/api/generate", pool, cfg)
+        body = await process_prompt(body, "/api/generate", pool, cfg, dict(request.headers))
         messages = [{"role": "user", "content": body.get("prompt", "")}]
         sess = session_key(model, messages)
         text, raw = await call_ollama("/api/generate", body, upstream)
@@ -709,7 +770,7 @@ async def anthropic_messages(request: Request) -> Response:
         headers = _relay_headers(request, ANTHROPIC_RELAY_HEADERS)
         if "anthropic-version" not in {k.lower() for k in headers}:
             headers["anthropic-version"] = "2023-06-01"
-        body = await process_prompt(body, "/v1/messages", pool, cfg)
+        body = await process_prompt(body, "/v1/messages", pool, cfg, headers)
         messages = body.get("messages", [])
         sess = session_key(model, messages)
         text, raw = await call_anthropic(body, upstream, headers)
@@ -761,7 +822,7 @@ async def openai_chat_completions(request: Request) -> Response:
     log.info("chat  route=/v1/chat/completions model=%s upstream=%s", model, upstream)
     try:
         headers = _relay_headers(request, OPENAI_RELAY_HEADERS)
-        body = await process_prompt(body, "/v1/chat/completions", pool, cfg)
+        body = await process_prompt(body, "/v1/chat/completions", pool, cfg, headers)
         messages = body.get("messages", [])
         sess = session_key(model, messages)
         text, raw = await call_openai(body, upstream, headers)
diff --git a/plugins/festinger/festinger/write_queue.py b/plugins/festinger/festinger/write_queue.py
index 76aba60..24f0e8a 100644
--- a/plugins/festinger/festinger/write_queue.py
+++ b/plugins/festinger/festinger/write_queue.py
@@ -34,9 +34,14 @@ class ContextDiscoverRequest:
 
     candidate_tokens are hints for the LLM — it decides which are real domain
     concepts vs typos/noise, and extracts relationship triples from the text.
+
+    fallback_model: if set, used when write_model_id is not configured in the
+    DB.  Allows Festinger to mirror the model Agent0 is currently using without
+    any extra configuration.
     """
     user_text: str
     candidate_tokens: list[str]
+    fallback_model: Optional[ModelConfig] = None
 
 
 @dataclass
@@ -54,12 +59,23 @@ _LLM_CONCURRENCY = 2
 _llm_semaphore: asyncio.Semaphore | None = None
 
 
-async def enqueue_context_discover(user_text: str, candidate_tokens: list[str]) -> None:
-    """Enqueue a user message for LLM-driven concept discovery and relation extraction."""
+async def enqueue_context_discover(
+    user_text: str,
+    candidate_tokens: list[str],
+    fallback_model: Optional[ModelConfig] = None,
+) -> None:
+    """
+    Enqueue a user message for LLM-driven concept discovery and relation extraction.
+
+    fallback_model: ModelConfig derived from the intercepted request.  Used when
+    write_model_id is not configured — lets Festinger mirror Agent0's model
+    without any extra setup.
+    """
     try:
         _queue.put_nowait(ContextDiscoverRequest(
             user_text=user_text,
             candidate_tokens=candidate_tokens,
+            fallback_model=fallback_model,
         ))
     except asyncio.QueueFull:
         log.warning("write queue full — dropping context discover")
@@ -103,7 +119,7 @@ async def _worker(pool: asyncpg.Pool) -> None:
                 # Slow path: fire off without awaiting so the worker stays free.
                 asyncio.create_task(
                     _process_context_discover_guarded(
-                        pool, item.user_text, item.candidate_tokens
+                        pool, item.user_text, item.candidate_tokens, item.fallback_model
                     )
                 )
         except Exception as e:
@@ -116,12 +132,13 @@ async def _process_context_discover_guarded(
     pool: asyncpg.Pool,
     user_text: str,
     candidate_tokens: list[str],
+    fallback_model: Optional[ModelConfig] = None,
 ) -> None:
     """Wrapper that acquires the LLM semaphore before concept discovery."""
     assert _llm_semaphore is not None
     async with _llm_semaphore:
         try:
-            await _process_context_discover(pool, user_text, candidate_tokens)
+            await _process_context_discover(pool, user_text, candidate_tokens, fallback_model)
         except Exception as e:
             log.exception("context discover task error: %s", e)
 
@@ -158,11 +175,17 @@ async def _process_context_discover(
     pool: asyncpg.Pool,
     user_text: str,
     candidate_tokens: list[str],
+    fallback_model: Optional[ModelConfig] = None,
 ) -> None:
     """
-    Ask the local LLM to evaluate candidate tokens (dictionary misses) and extract
+    Ask the LLM to evaluate candidate tokens (dictionary misses) and extract
     relationship triples for those it judges to be real domain concepts.
 
+    Model selection priority:
+      1. write_model_id config key → explicit utility model override (DB models table)
+      2. fallback_model              → mirrors the model Agent0 used for this request
+      3. Neither set                 → skip (log debug)
+
     Saliency feedback loop:
       - Confirmed concepts (triples inserted) → saliency raised to NOVEL_CONFIRMED_SALIENCY
         so they surface as recollection hits on subsequent turns.
@@ -170,13 +193,19 @@ async def _process_context_discover(
         effectively hiding typos and noise from the recollection engine.
     """
     write_model_id = await get_config(pool, "write_model_id")
-    if not write_model_id:
-        log.debug("no write_model_id configured — skipping context discover")
-        return
-
-    model = await get_model_config(pool, write_model_id)
-    if not model:
-        log.warning("write_model_id=%s not found in models table", write_model_id)
+    if write_model_id:
+        model = await get_model_config(pool, write_model_id)
+        if not model:
+            log.warning("write_model_id=%s not found in models table", write_model_id)
+            return
+    elif fallback_model:
+        model = fallback_model
+        log.debug(
+            "context discover: using request model provider=%s model=%s",
+            model.provider, model.model_name,
+        )
+    else:
+        log.debug("no write_model_id configured and no request model available — skipping context discover")
         return
 
     seed_dims = ["type", "membership", "runs-on", "tech", "owned-by", "geography"]