Mirror request model for context discovery — no write_model_id needed

Festinger now extracts provider/model/api-key from every intercepted
request and passes it to the context-discover queue as a fallback_model.
_process_context_discover uses it when write_model_id is not configured,
so Agent0's current model (LM Studio, Ollama, Anthropic) is automatically
reused for utility LLM calls without any extra setup.

Priority: write_model_id (explicit override) > fallback_model (request mirror)

Also updates upstream_openai default in config.yaml to LM Studio's
local address (host.docker.internal:1234).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-21 19:17:51 +02:00
parent ccbb5b2d45
commit 7210fe2066
3 changed files with 111 additions and 20 deletions
+3 -2
View File
@@ -7,9 +7,10 @@ upstream_ollama: "http://host.docker.internal:11434"
# Override via UPSTREAM_ANTHROPIC env var if needed # Override via UPSTREAM_ANTHROPIC env var if needed
upstream_anthropic: "https://api.anthropic.com" upstream_anthropic: "https://api.anthropic.com"
# Where the real OpenAI-compatible API is running (for /v1/chat/completions) # Where the real OpenAI-compatible API is running (for /v1/chat/completions).
# For LM Studio set this to its local address, e.g. "http://host.docker.internal:1234"
# Override via UPSTREAM_OPENAI env var if needed # Override via UPSTREAM_OPENAI env var if needed
upstream_openai: "https://api.openai.com" upstream_openai: "http://host.docker.internal:1234"
# Port this proxy listens on inside the container (exposed as 11434 on the docker network) # Port this proxy listens on inside the container (exposed as 11434 on the docker network)
proxy_port: 11434 proxy_port: 11434
+67 -6
View File
@@ -44,6 +44,7 @@ from .cue_scanner import scan_cues
from .recollection import build_recollection_block, inject_recollection from .recollection import build_recollection_block, inject_recollection
from .resolution_job import run_resolution_job, last_run_timestamp from .resolution_job import run_resolution_job, last_run_timestamp
from .tokenizer import tokenize from .tokenizer import tokenize
from .llm_client import ModelConfig
from .write_queue import enqueue_context_discover, enqueue_cue, start_worker, stop_worker from .write_queue import enqueue_context_discover, enqueue_cue, start_worker, stop_worker
from .urd_writer import InsertRequest, insert_urd_edge from .urd_writer import InsertRequest, insert_urd_edge
from .wordnet import import_wordnet, CITATION as WORDNET_CITATION from .wordnet import import_wordnet, CITATION as WORDNET_CITATION
@@ -517,11 +518,67 @@ def inject_recollection_anthropic(body: dict, block: str) -> dict:
return body return body
# ---------------------------------------------------------------------------
# Request model mirroring
# ---------------------------------------------------------------------------
def _extract_request_model_config(
path: str,
body: dict,
request_headers: dict,
cfg: dict,
) -> ModelConfig | None:
"""
Build a ModelConfig from the intercepted request so Festinger's utility
LLM calls (context discovery) can use the same provider/model as Agent0 —
no separate write_model_id configuration needed.
Provider inference:
/v1/messages → anthropic
/v1/chat/completions → lm-studio (OpenAI-compatible; base_url from upstream_openai)
/api/chat, /api/generate → lm-studio (Ollama's OpenAI-compat endpoint; base_url from upstream_ollama)
"""
model_name = body.get("model", "")
if not model_name:
return None
if path == "/v1/messages":
api_key = request_headers.get("x-api-key", "")
return ModelConfig(
provider="claude",
model_name=model_name,
api_key=api_key,
)
if path == "/v1/chat/completions":
auth = request_headers.get("authorization", "")
api_key = auth[len("Bearer "):].strip() if auth.lower().startswith("bearer ") else auth
base_url = cfg.get("upstream_openai", "")
return ModelConfig(
provider="lm-studio",
model_name=model_name,
api_key=api_key or "lm-studio",
base_url=base_url,
)
if path in ("/api/chat", "/api/generate"):
# Ollama exposes an OpenAI-compatible endpoint at the same base URL.
base_url = cfg.get("upstream_ollama", "")
return ModelConfig(
provider="lm-studio",
model_name=model_name,
api_key="ollama",
base_url=base_url,
)
return None
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Saliency + recollection pipeline # Saliency + recollection pipeline
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict: async def process_prompt(body: dict, path: str, pool, cfg: dict, request_headers: dict | None = None) -> dict:
""" """
Run the saliency + recollection pipeline over the prompt. Run the saliency + recollection pipeline over the prompt.
Returns a (possibly modified) body dict with the recollection block injected. Returns a (possibly modified) body dict with the recollection block injected.
@@ -530,6 +587,10 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
conf_floor = float(await get_config(pool, "recollection_confidence_floor", "0.6")) conf_floor = float(await get_config(pool, "recollection_confidence_floor", "0.6"))
recency_days = int(await get_config(pool, "recollection_recency_days", "90")) recency_days = int(await get_config(pool, "recollection_recency_days", "90"))
# Derive a ModelConfig from the intercepted request so context discovery can
# mirror Agent0's current model without a separate write_model_id config.
request_model = _extract_request_model_config(path, body, request_headers or {}, cfg)
# Extract only the last user message — agent responses and reasoning traces # Extract only the last user message — agent responses and reasoning traces
# are noise for both cue scanning and concept discovery. # are noise for both cue scanning and concept discovery.
user_text = _last_user_message_text(body, path) user_text = _last_user_message_text(body, path)
@@ -590,7 +651,7 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
# 3. Enqueue for LLM-driven discovery if there are candidates to evaluate. # 3. Enqueue for LLM-driven discovery if there are candidates to evaluate.
if novel_candidates and len(user_text) >= 20: if novel_candidates and len(user_text) >= 20:
await enqueue_context_discover(user_text, novel_candidates) await enqueue_context_discover(user_text, novel_candidates, fallback_model=request_model)
if not salient_for_read: if not salient_for_read:
return body return body
@@ -628,7 +689,7 @@ async def chat(request: Request) -> Response:
min_len = cfg["detection"]["min_length"] min_len = cfg["detection"]["min_length"]
log.info("chat route=/api/chat model=%s", model) log.info("chat route=/api/chat model=%s", model)
try: try:
body = await process_prompt(body, "/api/chat", pool, cfg) body = await process_prompt(body, "/api/chat", pool, cfg, dict(request.headers))
text, raw = await call_ollama("/api/chat", body, upstream) text, raw = await call_ollama("/api/chat", body, upstream)
sess = session_key(model, body.get("messages", [])) sess = session_key(model, body.get("messages", []))
count = record_and_check(sess, text, min_len) count = record_and_check(sess, text, min_len)
@@ -661,7 +722,7 @@ async def generate(request: Request) -> Response:
min_len = cfg["detection"]["min_length"] min_len = cfg["detection"]["min_length"]
log.info("chat route=/api/generate model=%s", model) log.info("chat route=/api/generate model=%s", model)
try: try:
body = await process_prompt(body, "/api/generate", pool, cfg) body = await process_prompt(body, "/api/generate", pool, cfg, dict(request.headers))
messages = [{"role": "user", "content": body.get("prompt", "")}] messages = [{"role": "user", "content": body.get("prompt", "")}]
sess = session_key(model, messages) sess = session_key(model, messages)
text, raw = await call_ollama("/api/generate", body, upstream) text, raw = await call_ollama("/api/generate", body, upstream)
@@ -709,7 +770,7 @@ async def anthropic_messages(request: Request) -> Response:
headers = _relay_headers(request, ANTHROPIC_RELAY_HEADERS) headers = _relay_headers(request, ANTHROPIC_RELAY_HEADERS)
if "anthropic-version" not in {k.lower() for k in headers}: if "anthropic-version" not in {k.lower() for k in headers}:
headers["anthropic-version"] = "2023-06-01" headers["anthropic-version"] = "2023-06-01"
body = await process_prompt(body, "/v1/messages", pool, cfg) body = await process_prompt(body, "/v1/messages", pool, cfg, headers)
messages = body.get("messages", []) messages = body.get("messages", [])
sess = session_key(model, messages) sess = session_key(model, messages)
text, raw = await call_anthropic(body, upstream, headers) text, raw = await call_anthropic(body, upstream, headers)
@@ -761,7 +822,7 @@ async def openai_chat_completions(request: Request) -> Response:
log.info("chat route=/v1/chat/completions model=%s upstream=%s", model, upstream) log.info("chat route=/v1/chat/completions model=%s upstream=%s", model, upstream)
try: try:
headers = _relay_headers(request, OPENAI_RELAY_HEADERS) headers = _relay_headers(request, OPENAI_RELAY_HEADERS)
body = await process_prompt(body, "/v1/chat/completions", pool, cfg) body = await process_prompt(body, "/v1/chat/completions", pool, cfg, headers)
messages = body.get("messages", []) messages = body.get("messages", [])
sess = session_key(model, messages) sess = session_key(model, messages)
text, raw = await call_openai(body, upstream, headers) text, raw = await call_openai(body, upstream, headers)
+41 -12
View File
@@ -34,9 +34,14 @@ class ContextDiscoverRequest:
candidate_tokens are hints for the LLM — it decides which are real domain candidate_tokens are hints for the LLM — it decides which are real domain
concepts vs typos/noise, and extracts relationship triples from the text. concepts vs typos/noise, and extracts relationship triples from the text.
fallback_model: if set, used when write_model_id is not configured in the
DB. Allows Festinger to mirror the model Agent0 is currently using without
any extra configuration.
""" """
user_text: str user_text: str
candidate_tokens: list[str] candidate_tokens: list[str]
fallback_model: Optional[ModelConfig] = None
@dataclass @dataclass
@@ -54,12 +59,23 @@ _LLM_CONCURRENCY = 2
_llm_semaphore: asyncio.Semaphore | None = None _llm_semaphore: asyncio.Semaphore | None = None
async def enqueue_context_discover(user_text: str, candidate_tokens: list[str]) -> None: async def enqueue_context_discover(
"""Enqueue a user message for LLM-driven concept discovery and relation extraction.""" user_text: str,
candidate_tokens: list[str],
fallback_model: Optional[ModelConfig] = None,
) -> None:
"""
Enqueue a user message for LLM-driven concept discovery and relation extraction.
fallback_model: ModelConfig derived from the intercepted request. Used when
write_model_id is not configured — lets Festinger mirror Agent0's model
without any extra setup.
"""
try: try:
_queue.put_nowait(ContextDiscoverRequest( _queue.put_nowait(ContextDiscoverRequest(
user_text=user_text, user_text=user_text,
candidate_tokens=candidate_tokens, candidate_tokens=candidate_tokens,
fallback_model=fallback_model,
)) ))
except asyncio.QueueFull: except asyncio.QueueFull:
log.warning("write queue full — dropping context discover") log.warning("write queue full — dropping context discover")
@@ -103,7 +119,7 @@ async def _worker(pool: asyncpg.Pool) -> None:
# Slow path: fire off without awaiting so the worker stays free. # Slow path: fire off without awaiting so the worker stays free.
asyncio.create_task( asyncio.create_task(
_process_context_discover_guarded( _process_context_discover_guarded(
pool, item.user_text, item.candidate_tokens pool, item.user_text, item.candidate_tokens, item.fallback_model
) )
) )
except Exception as e: except Exception as e:
@@ -116,12 +132,13 @@ async def _process_context_discover_guarded(
pool: asyncpg.Pool, pool: asyncpg.Pool,
user_text: str, user_text: str,
candidate_tokens: list[str], candidate_tokens: list[str],
fallback_model: Optional[ModelConfig] = None,
) -> None: ) -> None:
"""Wrapper that acquires the LLM semaphore before concept discovery.""" """Wrapper that acquires the LLM semaphore before concept discovery."""
assert _llm_semaphore is not None assert _llm_semaphore is not None
async with _llm_semaphore: async with _llm_semaphore:
try: try:
await _process_context_discover(pool, user_text, candidate_tokens) await _process_context_discover(pool, user_text, candidate_tokens, fallback_model)
except Exception as e: except Exception as e:
log.exception("context discover task error: %s", e) log.exception("context discover task error: %s", e)
@@ -158,11 +175,17 @@ async def _process_context_discover(
pool: asyncpg.Pool, pool: asyncpg.Pool,
user_text: str, user_text: str,
candidate_tokens: list[str], candidate_tokens: list[str],
fallback_model: Optional[ModelConfig] = None,
) -> None: ) -> None:
""" """
Ask the local LLM to evaluate candidate tokens (dictionary misses) and extract Ask the LLM to evaluate candidate tokens (dictionary misses) and extract
relationship triples for those it judges to be real domain concepts. relationship triples for those it judges to be real domain concepts.
Model selection priority:
1. write_model_id config key → explicit utility model override (DB models table)
2. fallback_model → mirrors the model Agent0 used for this request
3. Neither set → skip (log debug)
Saliency feedback loop: Saliency feedback loop:
- Confirmed concepts (triples inserted) → saliency raised to NOVEL_CONFIRMED_SALIENCY - Confirmed concepts (triples inserted) → saliency raised to NOVEL_CONFIRMED_SALIENCY
so they surface as recollection hits on subsequent turns. so they surface as recollection hits on subsequent turns.
@@ -170,13 +193,19 @@ async def _process_context_discover(
effectively hiding typos and noise from the recollection engine. effectively hiding typos and noise from the recollection engine.
""" """
write_model_id = await get_config(pool, "write_model_id") write_model_id = await get_config(pool, "write_model_id")
if not write_model_id: if write_model_id:
log.debug("no write_model_id configured — skipping context discover") model = await get_model_config(pool, write_model_id)
return if not model:
log.warning("write_model_id=%s not found in models table", write_model_id)
model = await get_model_config(pool, write_model_id) return
if not model: elif fallback_model:
log.warning("write_model_id=%s not found in models table", write_model_id) model = fallback_model
log.debug(
"context discover: using request model provider=%s model=%s",
model.provider, model.model_name,
)
else:
log.debug("no write_model_id configured and no request model available — skipping context discover")
return return
seed_dims = ["type", "membership", "runs-on", "tech", "owned-by", "geography"] seed_dims = ["type", "membership", "runs-on", "tech", "owned-by", "geography"]