Mirror request model for context discovery — no write_model_id needed
Festinger now extracts provider/model/api-key from every intercepted request and passes it to the context-discover queue as a fallback_model. _process_context_discover uses it when write_model_id is not configured, so Agent0's current model (LM Studio, Ollama, Anthropic) is automatically reused for utility LLM calls without any extra setup. Priority: write_model_id (explicit override) > fallback_model (request mirror) Also updates upstream_openai default in config.yaml to LM Studio's local address (host.docker.internal:1234). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -44,6 +44,7 @@ from .cue_scanner import scan_cues
|
||||
from .recollection import build_recollection_block, inject_recollection
|
||||
from .resolution_job import run_resolution_job, last_run_timestamp
|
||||
from .tokenizer import tokenize
|
||||
from .llm_client import ModelConfig
|
||||
from .write_queue import enqueue_context_discover, enqueue_cue, start_worker, stop_worker
|
||||
from .urd_writer import InsertRequest, insert_urd_edge
|
||||
from .wordnet import import_wordnet, CITATION as WORDNET_CITATION
|
||||
@@ -517,11 +518,67 @@ def inject_recollection_anthropic(body: dict, block: str) -> dict:
|
||||
return body
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Request model mirroring
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _extract_request_model_config(
|
||||
path: str,
|
||||
body: dict,
|
||||
request_headers: dict,
|
||||
cfg: dict,
|
||||
) -> ModelConfig | None:
|
||||
"""
|
||||
Build a ModelConfig from the intercepted request so Festinger's utility
|
||||
LLM calls (context discovery) can use the same provider/model as Agent0 —
|
||||
no separate write_model_id configuration needed.
|
||||
|
||||
Provider inference:
|
||||
/v1/messages → anthropic
|
||||
/v1/chat/completions → lm-studio (OpenAI-compatible; base_url from upstream_openai)
|
||||
/api/chat, /api/generate → lm-studio (Ollama's OpenAI-compat endpoint; base_url from upstream_ollama)
|
||||
"""
|
||||
model_name = body.get("model", "")
|
||||
if not model_name:
|
||||
return None
|
||||
|
||||
if path == "/v1/messages":
|
||||
api_key = request_headers.get("x-api-key", "")
|
||||
return ModelConfig(
|
||||
provider="claude",
|
||||
model_name=model_name,
|
||||
api_key=api_key,
|
||||
)
|
||||
|
||||
if path == "/v1/chat/completions":
|
||||
auth = request_headers.get("authorization", "")
|
||||
api_key = auth[len("Bearer "):].strip() if auth.lower().startswith("bearer ") else auth
|
||||
base_url = cfg.get("upstream_openai", "")
|
||||
return ModelConfig(
|
||||
provider="lm-studio",
|
||||
model_name=model_name,
|
||||
api_key=api_key or "lm-studio",
|
||||
base_url=base_url,
|
||||
)
|
||||
|
||||
if path in ("/api/chat", "/api/generate"):
|
||||
# Ollama exposes an OpenAI-compatible endpoint at the same base URL.
|
||||
base_url = cfg.get("upstream_ollama", "")
|
||||
return ModelConfig(
|
||||
provider="lm-studio",
|
||||
model_name=model_name,
|
||||
api_key="ollama",
|
||||
base_url=base_url,
|
||||
)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Saliency + recollection pipeline
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
|
||||
async def process_prompt(body: dict, path: str, pool, cfg: dict, request_headers: dict | None = None) -> dict:
|
||||
"""
|
||||
Run the saliency + recollection pipeline over the prompt.
|
||||
Returns a (possibly modified) body dict with the recollection block injected.
|
||||
@@ -530,6 +587,10 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
|
||||
conf_floor = float(await get_config(pool, "recollection_confidence_floor", "0.6"))
|
||||
recency_days = int(await get_config(pool, "recollection_recency_days", "90"))
|
||||
|
||||
# Derive a ModelConfig from the intercepted request so context discovery can
|
||||
# mirror Agent0's current model without a separate write_model_id config.
|
||||
request_model = _extract_request_model_config(path, body, request_headers or {}, cfg)
|
||||
|
||||
# Extract only the last user message — agent responses and reasoning traces
|
||||
# are noise for both cue scanning and concept discovery.
|
||||
user_text = _last_user_message_text(body, path)
|
||||
@@ -590,7 +651,7 @@ async def process_prompt(body: dict, path: str, pool, cfg: dict) -> dict:
|
||||
|
||||
# 3. Enqueue for LLM-driven discovery if there are candidates to evaluate.
|
||||
if novel_candidates and len(user_text) >= 20:
|
||||
await enqueue_context_discover(user_text, novel_candidates)
|
||||
await enqueue_context_discover(user_text, novel_candidates, fallback_model=request_model)
|
||||
|
||||
if not salient_for_read:
|
||||
return body
|
||||
@@ -628,7 +689,7 @@ async def chat(request: Request) -> Response:
|
||||
min_len = cfg["detection"]["min_length"]
|
||||
log.info("chat route=/api/chat model=%s", model)
|
||||
try:
|
||||
body = await process_prompt(body, "/api/chat", pool, cfg)
|
||||
body = await process_prompt(body, "/api/chat", pool, cfg, dict(request.headers))
|
||||
text, raw = await call_ollama("/api/chat", body, upstream)
|
||||
sess = session_key(model, body.get("messages", []))
|
||||
count = record_and_check(sess, text, min_len)
|
||||
@@ -661,7 +722,7 @@ async def generate(request: Request) -> Response:
|
||||
min_len = cfg["detection"]["min_length"]
|
||||
log.info("chat route=/api/generate model=%s", model)
|
||||
try:
|
||||
body = await process_prompt(body, "/api/generate", pool, cfg)
|
||||
body = await process_prompt(body, "/api/generate", pool, cfg, dict(request.headers))
|
||||
messages = [{"role": "user", "content": body.get("prompt", "")}]
|
||||
sess = session_key(model, messages)
|
||||
text, raw = await call_ollama("/api/generate", body, upstream)
|
||||
@@ -709,7 +770,7 @@ async def anthropic_messages(request: Request) -> Response:
|
||||
headers = _relay_headers(request, ANTHROPIC_RELAY_HEADERS)
|
||||
if "anthropic-version" not in {k.lower() for k in headers}:
|
||||
headers["anthropic-version"] = "2023-06-01"
|
||||
body = await process_prompt(body, "/v1/messages", pool, cfg)
|
||||
body = await process_prompt(body, "/v1/messages", pool, cfg, headers)
|
||||
messages = body.get("messages", [])
|
||||
sess = session_key(model, messages)
|
||||
text, raw = await call_anthropic(body, upstream, headers)
|
||||
@@ -761,7 +822,7 @@ async def openai_chat_completions(request: Request) -> Response:
|
||||
log.info("chat route=/v1/chat/completions model=%s upstream=%s", model, upstream)
|
||||
try:
|
||||
headers = _relay_headers(request, OPENAI_RELAY_HEADERS)
|
||||
body = await process_prompt(body, "/v1/chat/completions", pool, cfg)
|
||||
body = await process_prompt(body, "/v1/chat/completions", pool, cfg, headers)
|
||||
messages = body.get("messages", [])
|
||||
sess = session_key(model, messages)
|
||||
text, raw = await call_openai(body, upstream, headers)
|
||||
|
||||
@@ -34,9 +34,14 @@ class ContextDiscoverRequest:
|
||||
|
||||
candidate_tokens are hints for the LLM — it decides which are real domain
|
||||
concepts vs typos/noise, and extracts relationship triples from the text.
|
||||
|
||||
fallback_model: if set, used when write_model_id is not configured in the
|
||||
DB. Allows Festinger to mirror the model Agent0 is currently using without
|
||||
any extra configuration.
|
||||
"""
|
||||
user_text: str
|
||||
candidate_tokens: list[str]
|
||||
fallback_model: Optional[ModelConfig] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -54,12 +59,23 @@ _LLM_CONCURRENCY = 2
|
||||
_llm_semaphore: asyncio.Semaphore | None = None
|
||||
|
||||
|
||||
async def enqueue_context_discover(user_text: str, candidate_tokens: list[str]) -> None:
|
||||
"""Enqueue a user message for LLM-driven concept discovery and relation extraction."""
|
||||
async def enqueue_context_discover(
|
||||
user_text: str,
|
||||
candidate_tokens: list[str],
|
||||
fallback_model: Optional[ModelConfig] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Enqueue a user message for LLM-driven concept discovery and relation extraction.
|
||||
|
||||
fallback_model: ModelConfig derived from the intercepted request. Used when
|
||||
write_model_id is not configured — lets Festinger mirror Agent0's model
|
||||
without any extra setup.
|
||||
"""
|
||||
try:
|
||||
_queue.put_nowait(ContextDiscoverRequest(
|
||||
user_text=user_text,
|
||||
candidate_tokens=candidate_tokens,
|
||||
fallback_model=fallback_model,
|
||||
))
|
||||
except asyncio.QueueFull:
|
||||
log.warning("write queue full — dropping context discover")
|
||||
@@ -103,7 +119,7 @@ async def _worker(pool: asyncpg.Pool) -> None:
|
||||
# Slow path: fire off without awaiting so the worker stays free.
|
||||
asyncio.create_task(
|
||||
_process_context_discover_guarded(
|
||||
pool, item.user_text, item.candidate_tokens
|
||||
pool, item.user_text, item.candidate_tokens, item.fallback_model
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -116,12 +132,13 @@ async def _process_context_discover_guarded(
|
||||
pool: asyncpg.Pool,
|
||||
user_text: str,
|
||||
candidate_tokens: list[str],
|
||||
fallback_model: Optional[ModelConfig] = None,
|
||||
) -> None:
|
||||
"""Wrapper that acquires the LLM semaphore before concept discovery."""
|
||||
assert _llm_semaphore is not None
|
||||
async with _llm_semaphore:
|
||||
try:
|
||||
await _process_context_discover(pool, user_text, candidate_tokens)
|
||||
await _process_context_discover(pool, user_text, candidate_tokens, fallback_model)
|
||||
except Exception as e:
|
||||
log.exception("context discover task error: %s", e)
|
||||
|
||||
@@ -158,11 +175,17 @@ async def _process_context_discover(
|
||||
pool: asyncpg.Pool,
|
||||
user_text: str,
|
||||
candidate_tokens: list[str],
|
||||
fallback_model: Optional[ModelConfig] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Ask the local LLM to evaluate candidate tokens (dictionary misses) and extract
|
||||
Ask the LLM to evaluate candidate tokens (dictionary misses) and extract
|
||||
relationship triples for those it judges to be real domain concepts.
|
||||
|
||||
Model selection priority:
|
||||
1. write_model_id config key → explicit utility model override (DB models table)
|
||||
2. fallback_model → mirrors the model Agent0 used for this request
|
||||
3. Neither set → skip (log debug)
|
||||
|
||||
Saliency feedback loop:
|
||||
- Confirmed concepts (triples inserted) → saliency raised to NOVEL_CONFIRMED_SALIENCY
|
||||
so they surface as recollection hits on subsequent turns.
|
||||
@@ -170,13 +193,19 @@ async def _process_context_discover(
|
||||
effectively hiding typos and noise from the recollection engine.
|
||||
"""
|
||||
write_model_id = await get_config(pool, "write_model_id")
|
||||
if not write_model_id:
|
||||
log.debug("no write_model_id configured — skipping context discover")
|
||||
return
|
||||
|
||||
model = await get_model_config(pool, write_model_id)
|
||||
if not model:
|
||||
log.warning("write_model_id=%s not found in models table", write_model_id)
|
||||
if write_model_id:
|
||||
model = await get_model_config(pool, write_model_id)
|
||||
if not model:
|
||||
log.warning("write_model_id=%s not found in models table", write_model_id)
|
||||
return
|
||||
elif fallback_model:
|
||||
model = fallback_model
|
||||
log.debug(
|
||||
"context discover: using request model provider=%s model=%s",
|
||||
model.provider, model.model_name,
|
||||
)
|
||||
else:
|
||||
log.debug("no write_model_id configured and no request model available — skipping context discover")
|
||||
return
|
||||
|
||||
seed_dims = ["type", "membership", "runs-on", "tech", "owned-by", "geography"]
|
||||
|
||||
Reference in New Issue
Block a user