From 7210fe2066cd61a9fce31431584f6269e65a7fa9 Mon Sep 17 00:00:00 2001 From: jenstandstad Date: Tue, 21 Apr 2026 19:17:51 +0200 Subject: [PATCH] =?UTF-8?q?Mirror=20request=20model=20for=20context=20disc?= =?UTF-8?q?overy=20=E2=80=94=20no=20write=5Fmodel=5Fid=20needed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Festinger now extracts provider/model/api-key from every intercepted request and passes it to the context-discover queue as a fallback_model. _process_context_discover uses it when write_model_id is not configured, so Agent0's current model (LM Studio, Ollama, Anthropic) is automatically reused for utility LLM calls without any extra setup. Priority: write_model_id (explicit override) > fallback_model (request mirror) Also updates upstream_openai default in config.yaml to LM Studio's local address (host.docker.internal:1234). Co-Authored-By: Claude Sonnet 4.6 --- plugins/festinger/config.yaml | 5 +- plugins/festinger/festinger/main.py | 73 ++++++++++++++++++++-- plugins/festinger/festinger/write_queue.py | 53 ++++++++++++---- 3 files changed, 111 insertions(+), 20 deletions(-) diff --git a/plugins/festinger/config.yaml b/plugins/festinger/config.yaml index 23e2221..a728739 100644 --- a/plugins/festinger/config.yaml +++ b/plugins/festinger/config.yaml @@ -7,9 +7,10 @@ upstream_ollama: "http://host.docker.internal:11434" # Override via UPSTREAM_ANTHROPIC env var if needed upstream_anthropic: "https://api.anthropic.com" -# Where the real OpenAI-compatible API is running (for /v1/chat/completions) +# Where the real OpenAI-compatible API is running (for /v1/chat/completions). +# For LM Studio set this to its local address, e.g. "http://host.docker.internal:1234" # Override via UPSTREAM_OPENAI env var if needed -upstream_openai: "https://api.openai.com" +upstream_openai: "http://host.docker.internal:1234" # Port this proxy listens on inside the container (exposed as 11434 on the docker network) proxy_port: 11434 diff --git a/plugins/festinger/festinger/main.py b/plugins/festinger/festinger/main.py index b68968c..e8d43e7 100644 --- a/plugins/festinger/festinger/main.py +++ b/plugins/festinger/festinger/main.py @@ -44,6 +44,7 @@ from .cue_scanner import scan_cues from .recollection import build_recollection_block, inject_recollection from .resolution_job import run_resolution_job, last_run_timestamp from .tokenizer import tokenize +from .llm_client import ModelConfig from .write_queue import enqueue_context_discover, enqueue_cue, start_worker, stop_worker from .urd_writer import InsertRequest, insert_urd_edge from .wordnet import import_wordnet, CITATION as WORDNET_CITATION @@ -517,11 +518,67 @@ def inject_recollection_anthropic(body: dict, block: str) -> dict: return body +# --------------------------------------------------------------------------- +# Request model mirroring +# --------------------------------------------------------------------------- + +def _extract_request_model_config( + path: str, + body: dict, + request_headers: dict, + cfg: dict, +) -> ModelConfig | None: + """ + Build a ModelConfig from the intercepted request so Festinger's utility + LLM calls (context discovery) can use the same provider/model as Agent0 — + no separate write_model_id configuration needed. + + Provider inference: + /v1/messages → anthropic + /v1/chat/completions → lm-studio (OpenAI-compatible; base_url from upstream_openai) + /api/chat, /api/generate → lm-studio (Ollama's OpenAI-compat endpoint; base_url from upstream_ollama) + """ + model_name = body.get("model", "") + if not model_name: + return None + + if path == "/v1/messages": + api_key = request_headers.get("x-api-key", "") + return ModelConfig( + provider="claude", + model_name=model_name, + api_key=api_key, + ) + + if path == "/v1/chat/completions": + auth = request_headers.get("authorization", "") + api_key = auth[len("Bearer "):].strip() if auth.lower().startswith("bearer ") else auth + base_url = cfg.get("upstream_openai", "") + return ModelConfig( + provider="lm-studio", + model_name=model_name, + api_key=api_key or "lm-studio", + base_url=base_url, + ) + + if path in ("/api/chat", "/api/generate"): + # Ollama exposes an OpenAI-compatible endpoint at the same base URL. + base_url = cfg.get("upstream_ollama", "") + return ModelConfig( + provider="lm-studio", + model_name=model_name, + api_key="ollama", + base_url=base_url, + ) + + return None + + # --------------------------------------------------------------------------- # Saliency + recollection pipeline # --------------------------------------------------------------------------- -async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict: +async def process_prompt(body: dict, path: str, pool, cfg: dict, request_headers: dict | None = None) -> dict: """ Run the saliency + recollection pipeline over the prompt. Returns a (possibly modified) body dict with the recollection block injected. @@ -530,6 +587,10 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict: conf_floor = float(await get_config(pool, "recollection_confidence_floor", "0.6")) recency_days = int(await get_config(pool, "recollection_recency_days", "90")) + # Derive a ModelConfig from the intercepted request so context discovery can + # mirror Agent0's current model without a separate write_model_id config. + request_model = _extract_request_model_config(path, body, request_headers or {}, cfg) + # Extract only the last user message — agent responses and reasoning traces # are noise for both cue scanning and concept discovery. user_text = _last_user_message_text(body, path) @@ -590,7 +651,7 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict: # 3. Enqueue for LLM-driven discovery if there are candidates to evaluate. if novel_candidates and len(user_text) >= 20: - await enqueue_context_discover(user_text, novel_candidates) + await enqueue_context_discover(user_text, novel_candidates, fallback_model=request_model) if not salient_for_read: return body @@ -628,7 +689,7 @@ async def chat(request: Request) -> Response: min_len = cfg["detection"]["min_length"] log.info("chat route=/api/chat model=%s", model) try: - body = await process_prompt(body, "/api/chat", pool, cfg) + body = await process_prompt(body, "/api/chat", pool, cfg, dict(request.headers)) text, raw = await call_ollama("/api/chat", body, upstream) sess = session_key(model, body.get("messages", [])) count = record_and_check(sess, text, min_len) @@ -661,7 +722,7 @@ async def generate(request: Request) -> Response: min_len = cfg["detection"]["min_length"] log.info("chat route=/api/generate model=%s", model) try: - body = await process_prompt(body, "/api/generate", pool, cfg) + body = await process_prompt(body, "/api/generate", pool, cfg, dict(request.headers)) messages = [{"role": "user", "content": body.get("prompt", "")}] sess = session_key(model, messages) text, raw = await call_ollama("/api/generate", body, upstream) @@ -709,7 +770,7 @@ async def anthropic_messages(request: Request) -> Response: headers = _relay_headers(request, ANTHROPIC_RELAY_HEADERS) if "anthropic-version" not in {k.lower() for k in headers}: headers["anthropic-version"] = "2023-06-01" - body = await process_prompt(body, "/v1/messages", pool, cfg) + body = await process_prompt(body, "/v1/messages", pool, cfg, headers) messages = body.get("messages", []) sess = session_key(model, messages) text, raw = await call_anthropic(body, upstream, headers) @@ -761,7 +822,7 @@ async def openai_chat_completions(request: Request) -> Response: log.info("chat route=/v1/chat/completions model=%s upstream=%s", model, upstream) try: headers = _relay_headers(request, OPENAI_RELAY_HEADERS) - body = await process_prompt(body, "/v1/chat/completions", pool, cfg) + body = await process_prompt(body, "/v1/chat/completions", pool, cfg, headers) messages = body.get("messages", []) sess = session_key(model, messages) text, raw = await call_openai(body, upstream, headers) diff --git a/plugins/festinger/festinger/write_queue.py b/plugins/festinger/festinger/write_queue.py index 76aba60..24f0e8a 100644 --- a/plugins/festinger/festinger/write_queue.py +++ b/plugins/festinger/festinger/write_queue.py @@ -34,9 +34,14 @@ class ContextDiscoverRequest: candidate_tokens are hints for the LLM — it decides which are real domain concepts vs typos/noise, and extracts relationship triples from the text. + + fallback_model: if set, used when write_model_id is not configured in the + DB. Allows Festinger to mirror the model Agent0 is currently using without + any extra configuration. """ user_text: str candidate_tokens: list[str] + fallback_model: Optional[ModelConfig] = None @dataclass @@ -54,12 +59,23 @@ _LLM_CONCURRENCY = 2 _llm_semaphore: asyncio.Semaphore | None = None -async def enqueue_context_discover(user_text: str, candidate_tokens: list[str]) -> None: - """Enqueue a user message for LLM-driven concept discovery and relation extraction.""" +async def enqueue_context_discover( + user_text: str, + candidate_tokens: list[str], + fallback_model: Optional[ModelConfig] = None, +) -> None: + """ + Enqueue a user message for LLM-driven concept discovery and relation extraction. + + fallback_model: ModelConfig derived from the intercepted request. Used when + write_model_id is not configured — lets Festinger mirror Agent0's model + without any extra setup. + """ try: _queue.put_nowait(ContextDiscoverRequest( user_text=user_text, candidate_tokens=candidate_tokens, + fallback_model=fallback_model, )) except asyncio.QueueFull: log.warning("write queue full — dropping context discover") @@ -103,7 +119,7 @@ async def _worker(pool: asyncpg.Pool) -> None: # Slow path: fire off without awaiting so the worker stays free. asyncio.create_task( _process_context_discover_guarded( - pool, item.user_text, item.candidate_tokens + pool, item.user_text, item.candidate_tokens, item.fallback_model ) ) except Exception as e: @@ -116,12 +132,13 @@ async def _process_context_discover_guarded( pool: asyncpg.Pool, user_text: str, candidate_tokens: list[str], + fallback_model: Optional[ModelConfig] = None, ) -> None: """Wrapper that acquires the LLM semaphore before concept discovery.""" assert _llm_semaphore is not None async with _llm_semaphore: try: - await _process_context_discover(pool, user_text, candidate_tokens) + await _process_context_discover(pool, user_text, candidate_tokens, fallback_model) except Exception as e: log.exception("context discover task error: %s", e) @@ -158,11 +175,17 @@ async def _process_context_discover( pool: asyncpg.Pool, user_text: str, candidate_tokens: list[str], + fallback_model: Optional[ModelConfig] = None, ) -> None: """ - Ask the local LLM to evaluate candidate tokens (dictionary misses) and extract + Ask the LLM to evaluate candidate tokens (dictionary misses) and extract relationship triples for those it judges to be real domain concepts. + Model selection priority: + 1. write_model_id config key → explicit utility model override (DB models table) + 2. fallback_model → mirrors the model Agent0 used for this request + 3. Neither set → skip (log debug) + Saliency feedback loop: - Confirmed concepts (triples inserted) → saliency raised to NOVEL_CONFIRMED_SALIENCY so they surface as recollection hits on subsequent turns. @@ -170,13 +193,19 @@ async def _process_context_discover( effectively hiding typos and noise from the recollection engine. """ write_model_id = await get_config(pool, "write_model_id") - if not write_model_id: - log.debug("no write_model_id configured — skipping context discover") - return - - model = await get_model_config(pool, write_model_id) - if not model: - log.warning("write_model_id=%s not found in models table", write_model_id) + if write_model_id: + model = await get_model_config(pool, write_model_id) + if not model: + log.warning("write_model_id=%s not found in models table", write_model_id) + return + elif fallback_model: + model = fallback_model + log.debug( + "context discover: using request model provider=%s model=%s", + model.provider, model.model_name, + ) + else: + log.debug("no write_model_id configured and no request model available — skipping context discover") return seed_dims = ["type", "membership", "runs-on", "tech", "owned-by", "geography"]