bb2a4e2293
Leading/trailing spaces in the endpoint field cause httpx to receive a URL without a recognisable scheme, producing UnsupportedProtocol. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
693 lines
29 KiB
Python
693 lines
29 KiB
Python
#!/usr/bin/env python3
|
|
"""Agent Inference Service
|
|
|
|
A generic LLM inference service that can serve any configured agent.
|
|
Receives the full agent profile (identity, knowledge, guardrails) and model
|
|
configuration in each request.
|
|
|
|
Three connection modes are supported, selected by agent_type (primary) or
|
|
the endpoint's type field:
|
|
|
|
agent_type = "standard" / type = anthropic|openai|ollama|lm_studio
|
|
Assembles a full system prompt from the agent profile and calls the model
|
|
via litellm. The LLM must respond with JSON {"message", "pose"}.
|
|
|
|
agent_type = "agent0" / type = "agent0"
|
|
Connects to a live Agent Zero instance via its MCP server (streamable-http).
|
|
Calls the built-in send_message tool, which has full tool use, memory, and
|
|
persistent context. Pose is chosen heuristically from the plain-text reply.
|
|
MCP URL: {endpoint}/mcp/t-{mcp_key}/http
|
|
|
|
agent_type = "hermes"
|
|
Connects to a Hermes Agent dashboard via JSON-RPC 2.0 WebSocket.
|
|
Fetches the ephemeral session token from the dashboard HTML automatically.
|
|
WebSocket: {endpoint}/api/ws?token={token}
|
|
Session: session.create → prompt.submit → stream message.delta events.
|
|
|
|
Nothing is summarised or truncated by this service. Context-window management
|
|
for LLM calls is the responsibility of the caller (gnommoweb).
|
|
"""
|
|
|
|
import asyncio
|
|
import os
|
|
import json
|
|
import re
|
|
import time
|
|
import logging
|
|
from fastapi import FastAPI, HTTPException, Header
|
|
from pydantic import BaseModel, Field
|
|
from typing import Optional
|
|
import httpx
|
|
import litellm
|
|
|
|
# --- Config ---
|
|
API_KEY = os.getenv("AGENT_INFERENCE_KEY", "agent-inference-dev-key")
|
|
AGENT_MAX_MESSAGES = int(os.getenv("AGENT_INFERENCE_MAX_MESSAGES", "10"))
|
|
|
|
logging.basicConfig(level=logging.INFO)
|
|
log = logging.getLogger("agent-inference")
|
|
|
|
# ── In-memory Agent0 session map ──────────────────────────────────────────────
|
|
# Maps gnommoweb conversation_id → Agent Zero chat_id (context id).
|
|
# Allows conversation continuity across turns for Agent0 MCP connections.
|
|
# Cleared on service restart — acceptable for current single-user usage.
|
|
_a0_sessions: dict[int, str] = {}
|
|
|
|
# ── In-memory Hermes session map ───────────────────────────────────────────────
|
|
# Maps gnommoweb conversation_id → Hermes session_id.
|
|
# Also caches the dashboard token per endpoint (re-fetched on 401).
|
|
_hermes_sessions: dict[int, str] = {}
|
|
_hermes_token_cache: dict[str, str] = {} # endpoint → session token
|
|
|
|
# ── Request / Response models ─────────────────────────────────────────────────
|
|
|
|
class ModelConfig(BaseModel):
|
|
name: Optional[str] = None
|
|
model_id: str = "" # litellm model string; label-only for Agent0 MCP
|
|
endpoint: Optional[str] = None # base URL; api_base for litellm, or Agent0 host
|
|
api_key: Optional[str] = None # LLM API key; not used for Agent0 MCP
|
|
type: Optional[str] = None # anthropic | openai | ollama | lm_studio | hermes | agent0
|
|
mcp_key: Optional[str] = None # Agent0 MCP token
|
|
|
|
class ModelsPayload(BaseModel):
|
|
heaviness: int = 1 # 1=light, 2=medium, 3=heavy
|
|
light: Optional[ModelConfig] = None
|
|
medium: Optional[ModelConfig] = None
|
|
heavy: Optional[ModelConfig] = None
|
|
|
|
class AgentProfile(BaseModel):
|
|
"""Full agent profile received from gnommoweb. All fields sent verbatim."""
|
|
name: str
|
|
role: str = ""
|
|
from_name: str = ""
|
|
poses: list[str] = Field(default_factory=list)
|
|
identity_document: str = ""
|
|
job_description: str = ""
|
|
guardrails: str = ""
|
|
best_practices: str = ""
|
|
knowledge_base: str = ""
|
|
mock_responses: list[str] = Field(default_factory=list)
|
|
agent_type: str = "standard" # standard | agent0 | hermes
|
|
|
|
class ChatRequest(BaseModel):
|
|
user_id: int
|
|
agent_id: int = 1
|
|
message: str
|
|
conversation_id: Optional[int] = None
|
|
history: list = Field(default_factory=list)
|
|
agent: Optional[AgentProfile] = None
|
|
models: Optional[ModelsPayload] = None
|
|
|
|
class ChatResponse(BaseModel):
|
|
letter_id: int = 0
|
|
timestamp: int
|
|
message: str
|
|
pose: str
|
|
conversation_id: Optional[int] = None
|
|
actions: Optional[list] = None
|
|
|
|
# ── System prompt (LLM path only) ─────────────────────────────────────────────
|
|
|
|
def build_system_prompt(agent: AgentProfile, message_count: int = 0) -> str:
|
|
"""
|
|
Assemble the complete system prompt for one LLM call.
|
|
|
|
Sections (all verbatim from the agent's DB fields):
|
|
1. Identity anchor — always present; name + role + from_name
|
|
2. Identity document — full persona, backstory, values (if set)
|
|
3. Job description — responsibilities and objectives (if set)
|
|
4. Guardrails — hard constraints (if set)
|
|
5. Best practices — response style and soft rules (if set)
|
|
6. Knowledge base — factual grounding (if set)
|
|
7. Poses — complete list of valid pose identifiers
|
|
8. Response format — JSON output structure
|
|
9. Wrap-up hint — injected near the conversation limit
|
|
"""
|
|
parts = []
|
|
|
|
# ── 1. Identity anchor (always present) ──────────────────────────────────
|
|
anchor_lines = [f"You are {agent.name}"]
|
|
if agent.role:
|
|
anchor_lines[0] += f", {agent.role}"
|
|
if agent.from_name and agent.from_name not in (agent.name, f"{agent.name}, {agent.role}"):
|
|
anchor_lines.append(f"You are known as: {agent.from_name}")
|
|
anchor_lines.append(
|
|
"You MUST maintain this character and persona throughout the entire conversation. "
|
|
"Do NOT break character. Do NOT acknowledge being an AI or a language model."
|
|
)
|
|
parts.append("\n".join(anchor_lines))
|
|
|
|
# ── 2. Identity document (verbatim) ──────────────────────────────────────
|
|
if agent.identity_document:
|
|
parts.append(f"## IDENTITY\n\n{agent.identity_document}")
|
|
|
|
# ── 3. Job description (verbatim) ────────────────────────────────────────
|
|
if agent.job_description:
|
|
parts.append(f"## YOUR ROLE\n\n{agent.job_description}")
|
|
|
|
# ── 4. Guardrails (verbatim) ─────────────────────────────────────────────
|
|
if agent.guardrails:
|
|
parts.append(f"## GUARDRAILS — FOLLOW EXACTLY\n\n{agent.guardrails}")
|
|
|
|
# ── 5. Best practices (verbatim) ─────────────────────────────────────────
|
|
if agent.best_practices:
|
|
parts.append(f"## BEST PRACTICES\n\n{agent.best_practices}")
|
|
|
|
# ── 6. Knowledge base (verbatim) ─────────────────────────────────────────
|
|
if agent.knowledge_base:
|
|
parts.append(f"## KNOWLEDGE BASE\n\n{agent.knowledge_base}")
|
|
|
|
# ── 7. Poses ──────────────────────────────────────────────────────────────
|
|
valid_poses = agent.poses if agent.poses else ["neutral"]
|
|
pose_lines = [
|
|
"## POSES",
|
|
"",
|
|
"Every response MUST include a pose field set to one of these exact identifiers:",
|
|
"",
|
|
]
|
|
for p in valid_poses:
|
|
pose_lines.append(f" {p}")
|
|
pose_lines += [
|
|
"",
|
|
"Choose the pose that best reflects your character's genuine emotional state in this moment.",
|
|
"Pose names are descriptive — use them accordingly.",
|
|
"Any identifier not in the list above will be rejected.",
|
|
]
|
|
parts.append("\n".join(pose_lines))
|
|
|
|
# ── 8. Response format ────────────────────────────────────────────────────
|
|
parts.append(
|
|
"## RESPONSE FORMAT\n\n"
|
|
"Respond ONLY with a valid JSON object. No prose before or after it.\n\n"
|
|
"Required fields:\n"
|
|
' "message" — your in-character response text\n'
|
|
' "pose" — one identifier from the POSES list above\n\n'
|
|
"Optional field:\n"
|
|
' "actions" — array of action buttons to show the user, e.g.:\n'
|
|
' [{"emoji": "🫳🏽", "label": "Do"}, {"emoji": "👁️", "label": "Look"}]\n'
|
|
" Omit entirely to keep the current action buttons unchanged.\n"
|
|
" Maximum 4 actions. The Leave action is always implicit.\n\n"
|
|
"Example:\n"
|
|
'{\n'
|
|
' "message": "Your response here.",\n'
|
|
' "pose": "neutral"\n'
|
|
'}'
|
|
)
|
|
|
|
# ── 9. Wrap-up hint (injected near the conversation limit) ────────────────
|
|
wrap_up_threshold = max(AGENT_MAX_MESSAGES - 3, 4)
|
|
if message_count >= AGENT_MAX_MESSAGES:
|
|
parts.append(
|
|
"## END THIS CONVERSATION NOW\n\n"
|
|
"This conversation has reached its limit. You MUST wrap up in this response. "
|
|
"Thank the user, guide them toward whatever next step suits your role, "
|
|
"and set your pose to 'closed'. This is your final message."
|
|
)
|
|
elif message_count >= wrap_up_threshold:
|
|
parts.append(
|
|
"NOTE: This conversation is approaching its limit. "
|
|
"Begin guiding the user toward a natural conclusion."
|
|
)
|
|
|
|
return "\n\n".join(parts)
|
|
|
|
# ── Model selection ───────────────────────────────────────────────────────────
|
|
|
|
def select_model(payload: Optional[ModelsPayload]) -> Optional[ModelConfig]:
|
|
"""
|
|
Pick the ModelConfig based on requested heaviness.
|
|
Fallback chain: requested heaviness → lower levels → None (use env fallback).
|
|
"""
|
|
if not payload:
|
|
return None
|
|
|
|
heaviness = payload.heaviness
|
|
candidates = []
|
|
if heaviness >= 3 and payload.heavy:
|
|
candidates.append(payload.heavy)
|
|
if heaviness >= 2 and payload.medium:
|
|
candidates.append(payload.medium)
|
|
if payload.light:
|
|
candidates.append(payload.light)
|
|
|
|
chosen = candidates[0] if candidates else None
|
|
if not chosen:
|
|
log.warning(f"No model configured for heaviness={heaviness}, falling back to env model")
|
|
return chosen
|
|
|
|
# ── Pose heuristic (Agent0 MCP path) ─────────────────────────────────────────
|
|
|
|
def pick_pose(text: str, valid_poses: list[str]) -> str:
|
|
"""
|
|
Choose a pose based on simple keyword heuristics.
|
|
Used for Agent0 MCP responses, which are plain text not JSON.
|
|
"""
|
|
t = text.lower()
|
|
candidates: list[tuple[list[str], list[str]]] = [
|
|
(["error", "fail", "sorry", "cannot", "unable", "permission denied"], ["sorry", "annoyed", "bored"]),
|
|
(["done", "complete", "finished", "success", "pulled", "updated", "deployed", "pushed"], ["engaged", "interested", "naughty"]),
|
|
(["?", "what", "which", "how", "why", "could you"], ["inquisitive"]),
|
|
(["warning", "careful", "note", "attention"], ["annoyed", "bored"]),
|
|
]
|
|
for keywords, poses in candidates:
|
|
if any(k in t for k in keywords):
|
|
for p in poses:
|
|
if p in valid_poses:
|
|
return p
|
|
for p in ("neutral", "neutral2", "engaged"):
|
|
if p in valid_poses:
|
|
return p
|
|
return valid_poses[0] if valid_poses else "neutral"
|
|
|
|
# ── Agent0 MCP connection ─────────────────────────────────────────────────────
|
|
|
|
async def handle_agent0_mcp(
|
|
req: ChatRequest,
|
|
agent: AgentProfile,
|
|
model: ModelConfig,
|
|
valid_poses: list[str],
|
|
) -> ChatResponse:
|
|
"""
|
|
Send a message to a live Agent Zero instance via its MCP streamable-http
|
|
server and return the response as a ChatResponse.
|
|
|
|
MCP URL format: {endpoint}/mcp/t-{mcp_key}/http
|
|
Tool called: send_message (built into every Agent Zero instance)
|
|
|
|
Conversation continuity is maintained via Agent Zero's chat_id, which maps
|
|
to a gnommoweb conversation_id in _a0_sessions (in-memory).
|
|
"""
|
|
from mcp.client.streamable_http import streamablehttp_client
|
|
from mcp import ClientSession
|
|
|
|
endpoint = (model.endpoint or "").strip().rstrip("/")
|
|
mcp_key = model.mcp_key or ""
|
|
mcp_url = f"{endpoint}/mcp/t-{mcp_key}/http"
|
|
|
|
# Retrieve existing Agent0 chat_id for this conversation (if any)
|
|
a0_chat_id = _a0_sessions.get(req.conversation_id) if req.conversation_id else None
|
|
|
|
log.info(
|
|
f"[{agent.name}] → Agent0 MCP url={mcp_url} "
|
|
f"conv={req.conversation_id} a0_chat={a0_chat_id or 'new'} "
|
|
f"msg='{req.message[:60]}'"
|
|
)
|
|
|
|
try:
|
|
async with streamablehttp_client(mcp_url) as (read, write, _):
|
|
async with ClientSession(read, write) as session:
|
|
await session.initialize()
|
|
|
|
tool_args: dict = {
|
|
"message": req.message,
|
|
"persistent_chat": True,
|
|
}
|
|
if a0_chat_id:
|
|
tool_args["chat_id"] = a0_chat_id
|
|
|
|
result = await session.call_tool("send_message", tool_args)
|
|
|
|
# The tool returns a JSON-serialised ToolResponse / ToolError
|
|
raw = result.content[0].text if result.content else "{}"
|
|
log.info(f"[{agent.name}] ← Agent0 MCP raw: {raw[:300]}")
|
|
|
|
parsed = json.loads(raw)
|
|
|
|
if parsed.get("status") == "error":
|
|
raise RuntimeError(parsed.get("error", "Unknown Agent0 error"))
|
|
|
|
response_text = parsed.get("response", "")
|
|
new_chat_id = parsed.get("chat_id", "")
|
|
|
|
# Persist the Agent0 chat_id so the next turn continues the same context
|
|
if new_chat_id and req.conversation_id is not None:
|
|
_a0_sessions[req.conversation_id] = new_chat_id
|
|
|
|
pose = pick_pose(response_text, valid_poses)
|
|
|
|
return ChatResponse(
|
|
letter_id=0,
|
|
timestamp=int(time.time()),
|
|
message=response_text,
|
|
pose=pose,
|
|
conversation_id=req.conversation_id,
|
|
)
|
|
|
|
except Exception as e:
|
|
log.error(f"[{agent.name}] Agent0 MCP error: {e}")
|
|
fallback_pose = next(
|
|
(p for p in ("sorry", "annoyed", "neutral") if p in valid_poses),
|
|
valid_poses[0],
|
|
)
|
|
return ChatResponse(
|
|
letter_id=0,
|
|
timestamp=int(time.time()),
|
|
message="*connection interference* The response channel is temporarily disrupted. Try again in a moment.",
|
|
pose=fallback_pose,
|
|
conversation_id=req.conversation_id,
|
|
)
|
|
|
|
# ── Hermes Agent connection ───────────────────────────────────────────────────
|
|
|
|
async def _fetch_hermes_token(endpoint: str) -> str:
|
|
"""
|
|
Fetch the ephemeral session token injected into the Hermes dashboard HTML.
|
|
|
|
The token is generated fresh on every server start (secrets.token_urlsafe(32))
|
|
and embedded as: window.__HERMES_SESSION_TOKEN__="<token>"
|
|
|
|
Caches the result per endpoint; callers should invalidate on 401.
|
|
"""
|
|
cached = _hermes_token_cache.get(endpoint)
|
|
if cached:
|
|
return cached
|
|
|
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
|
resp = await client.get(f"{endpoint}/")
|
|
resp.raise_for_status()
|
|
html = resp.text
|
|
|
|
m = re.search(r'__HERMES_SESSION_TOKEN__\s*=\s*"([^"]+)"', html)
|
|
if not m:
|
|
raise RuntimeError(
|
|
f"Could not extract Hermes session token from {endpoint}/ "
|
|
"(is the dashboard running?)"
|
|
)
|
|
|
|
token = m.group(1)
|
|
_hermes_token_cache[endpoint] = token
|
|
log.info(f"[hermes] fetched new session token from {endpoint}")
|
|
return token
|
|
|
|
|
|
async def handle_hermes(
|
|
req: "ChatRequest",
|
|
agent: AgentProfile,
|
|
model: ModelConfig,
|
|
valid_poses: list[str],
|
|
) -> "ChatResponse":
|
|
"""
|
|
Send a message to a Hermes Agent dashboard via JSON-RPC 2.0 WebSocket.
|
|
|
|
Protocol:
|
|
1. Fetch ephemeral session token from dashboard HTML (cached, re-fetched on 401).
|
|
2. Open WebSocket at {endpoint}/api/ws?token={token}
|
|
3. JSON-RPC session.create {} → { session_id }
|
|
4. JSON-RPC prompt.submit { session_id, text } → (ack)
|
|
5. Accumulate message.delta payloads until message.complete
|
|
6. Return assembled text with pose heuristic.
|
|
|
|
Conversation continuity: gnommoweb conversation_id → Hermes session_id
|
|
stored in _hermes_sessions (in-memory, cleared on restart).
|
|
"""
|
|
try:
|
|
from websockets.asyncio.client import connect as ws_connect
|
|
except ImportError:
|
|
from websockets.client import connect as ws_connect # type: ignore
|
|
|
|
endpoint = (model.endpoint or "").strip().rstrip("/")
|
|
hermes_session_id = _hermes_sessions.get(req.conversation_id) if req.conversation_id else None
|
|
|
|
token = await _fetch_hermes_token(endpoint)
|
|
|
|
ws_scheme = "wss" if endpoint.startswith("https") else "ws"
|
|
ws_url = f"{ws_scheme}://{endpoint.split('://', 1)[-1]}/api/ws?token={token}"
|
|
|
|
log.info(
|
|
f"[{agent.name}] → Hermes url={endpoint} "
|
|
f"conv={req.conversation_id} hermes_sess={hermes_session_id or 'new'} "
|
|
f"msg='{req.message[:60]}'"
|
|
)
|
|
|
|
async def _do_chat(tok: str) -> ChatResponse:
|
|
nonlocal hermes_session_id
|
|
ws_url_inner = f"{ws_scheme}://{endpoint.split('://', 1)[-1]}/api/ws?token={tok}"
|
|
|
|
async with ws_connect(ws_url_inner) as ws:
|
|
req_id = 0
|
|
|
|
async def rpc(method: str, params: dict) -> dict:
|
|
nonlocal req_id
|
|
req_id += 1
|
|
rid = f"h{req_id}"
|
|
await ws.send(json.dumps({"jsonrpc": "2.0", "id": rid, "method": method, "params": params}))
|
|
# Wait for the matching response (skip events that arrive first)
|
|
while True:
|
|
raw = await ws.recv()
|
|
msg = json.loads(raw)
|
|
if msg.get("id") == rid:
|
|
if "error" in msg:
|
|
raise RuntimeError(msg["error"].get("message", "RPC error"))
|
|
return msg.get("result") or {}
|
|
# Event frame — queue it for later (ignored for now, we don't miss
|
|
# message.delta here because prompt.submit is called after create)
|
|
|
|
# ── 1. Create or reuse session ────────────────────────────────
|
|
if not hermes_session_id:
|
|
created = await rpc("session.create", {})
|
|
hermes_session_id = created.get("session_id") or created.get("id", "")
|
|
if req.conversation_id is not None and hermes_session_id:
|
|
_hermes_sessions[req.conversation_id] = hermes_session_id
|
|
log.info(f"[{agent.name}] created Hermes session {hermes_session_id}")
|
|
|
|
# ── 2. Submit prompt ──────────────────────────────────────────
|
|
# Send prompt.submit but do NOT wait for its ack via rpc() —
|
|
# the ack arrives interleaved with events (after message.start).
|
|
# We drain all frames below until message.complete arrives.
|
|
submit_id = f"h{req_id + 1}"
|
|
req_id += 1
|
|
await ws.send(json.dumps({
|
|
"jsonrpc": "2.0", "id": submit_id,
|
|
"method": "prompt.submit",
|
|
"params": {"session_id": hermes_session_id, "text": req.message},
|
|
}))
|
|
|
|
# ── 3. Stream events until message.complete ───────────────────
|
|
full_text = ""
|
|
while True:
|
|
raw = await asyncio.wait_for(ws.recv(), timeout=120.0)
|
|
msg = json.loads(raw)
|
|
|
|
# RPC ack for prompt.submit — ignore, keep reading
|
|
if msg.get("id") == submit_id:
|
|
continue
|
|
|
|
if msg.get("method") != "event":
|
|
continue
|
|
|
|
params = msg.get("params") or {}
|
|
etype = params.get("type", "")
|
|
payload = params.get("payload") or {}
|
|
|
|
if etype == "message.delta":
|
|
full_text += payload.get("text", "")
|
|
elif etype == "message.complete":
|
|
# payload.text holds the full assembled response
|
|
complete_text = payload.get("text", "")
|
|
if complete_text:
|
|
full_text = complete_text
|
|
if payload.get("status") == "error":
|
|
raise RuntimeError(full_text or "Hermes returned an error response")
|
|
break
|
|
elif etype == "error":
|
|
raise RuntimeError(payload.get("message", "Hermes error"))
|
|
|
|
pose = pick_pose(full_text, valid_poses)
|
|
return ChatResponse(
|
|
letter_id=0,
|
|
timestamp=int(time.time()),
|
|
message=full_text or "*silence*",
|
|
pose=pose,
|
|
conversation_id=req.conversation_id,
|
|
)
|
|
|
|
try:
|
|
return await _do_chat(token)
|
|
except Exception as e:
|
|
err_str = str(e)
|
|
# 401 / rejected — token likely regenerated; clear cache and retry once
|
|
if "401" in err_str or "403" in err_str or "Unauthorized" in err_str.lower():
|
|
log.warning(f"[{agent.name}] Hermes token rejected, re-fetching…")
|
|
_hermes_token_cache.pop(endpoint, None)
|
|
try:
|
|
token = await _fetch_hermes_token(endpoint)
|
|
return await _do_chat(token)
|
|
except Exception as retry_err:
|
|
log.error(f"[{agent.name}] Hermes retry failed: {retry_err}")
|
|
err_str = str(retry_err)
|
|
|
|
log.error(f"[{agent.name}] Hermes error: {err_str}")
|
|
fallback_pose = next(
|
|
(p for p in ("sorry", "annoyed", "neutral") if p in valid_poses),
|
|
valid_poses[0],
|
|
)
|
|
return ChatResponse(
|
|
letter_id=0,
|
|
timestamp=int(time.time()),
|
|
message="*connection interference* The response channel is temporarily disrupted. Try again in a moment.",
|
|
pose=fallback_pose,
|
|
conversation_id=req.conversation_id,
|
|
)
|
|
|
|
|
|
# ── Fallback agent ────────────────────────────────────────────────────────────
|
|
|
|
def make_fallback_agent() -> AgentProfile:
|
|
return AgentProfile(
|
|
name="Agent",
|
|
role="University Agent",
|
|
poses=["neutral", "bored", "annoyed", "engaged", "closed"],
|
|
identity_document="You are a helpful university agent.",
|
|
)
|
|
|
|
# ── App ───────────────────────────────────────────────────────────────────────
|
|
|
|
app = FastAPI(title="Agent Inference Service", version="1.2.0")
|
|
|
|
@app.get("/health")
|
|
async def health():
|
|
return {"status": "ok", "service": "agent-inference"}
|
|
|
|
|
|
@app.post("/v1/agent/chat", response_model=ChatResponse)
|
|
async def agent_chat(req: ChatRequest, authorization: str = Header(default="")):
|
|
if authorization != f"Bearer {API_KEY}":
|
|
raise HTTPException(status_code=401, detail="Unauthorised.")
|
|
|
|
agent = req.agent or make_fallback_agent()
|
|
valid_poses = agent.poses if agent.poses else ["neutral"]
|
|
model = select_model(req.models)
|
|
|
|
log.info(
|
|
f"[route] agent={agent.name!r} agent_type={agent.agent_type!r} "
|
|
f"models_payload={req.models} "
|
|
f"selected_model={model}"
|
|
)
|
|
|
|
# ── Route: Hermes Agent ───────────────────────────────────────────────────
|
|
if agent.agent_type == "hermes":
|
|
if not model or not model.endpoint:
|
|
log.error(f"[{agent.name}] Hermes agent has no model with endpoint configured")
|
|
return ChatResponse(
|
|
letter_id=0,
|
|
timestamp=int(time.time()),
|
|
message="*configuration error* No Hermes endpoint configured for this agent.",
|
|
pose=valid_poses[0],
|
|
conversation_id=req.conversation_id,
|
|
)
|
|
return await handle_hermes(req, agent, model, valid_poses)
|
|
|
|
# ── Route: Agent0 MCP ─────────────────────────────────────────────────────
|
|
if agent.agent_type == "agent0" or (model and model.type == "agent0"):
|
|
return await handle_agent0_mcp(req, agent, model, valid_poses)
|
|
|
|
# ── Route: standard LLM via litellm ──────────────────────────────────────
|
|
if not model or not model.model_id:
|
|
log.error(f"[{agent.name}] No inference endpoint configured for this agent")
|
|
return ChatResponse(
|
|
letter_id=0,
|
|
timestamp=int(time.time()),
|
|
message="*configuration error* No inference endpoint configured for this agent.",
|
|
pose=valid_poses[0],
|
|
conversation_id=req.conversation_id,
|
|
)
|
|
|
|
model_id = model.model_id
|
|
api_base = model.endpoint
|
|
api_key = model.api_key
|
|
|
|
total_messages = len(req.history) + 1
|
|
system_prompt = build_system_prompt(agent, message_count=total_messages)
|
|
|
|
# Structure: [system] + history + [current user message]
|
|
messages = [{"role": "system", "content": system_prompt}]
|
|
for msg in req.history:
|
|
role = msg.get("role", "user")
|
|
content = msg.get("content", "")
|
|
if role not in ("user", "assistant"):
|
|
role = "user"
|
|
messages.append({"role": role, "content": content})
|
|
messages.append({"role": "user", "content": req.message})
|
|
|
|
log.info(
|
|
f"[{agent.name}] → {model_id} "
|
|
f"heaviness={req.models.heaviness if req.models else 1} "
|
|
f"history={len(req.history)} msgs "
|
|
f"prompt={len(system_prompt)} chars "
|
|
f"user={req.user_id} "
|
|
f"msg='{req.message[:60]}'"
|
|
)
|
|
|
|
completion_kwargs = dict(
|
|
model=model_id,
|
|
messages=messages,
|
|
temperature=0.8,
|
|
max_tokens=400,
|
|
)
|
|
if api_base:
|
|
completion_kwargs["api_base"] = api_base
|
|
if api_key:
|
|
completion_kwargs["api_key"] = api_key
|
|
|
|
try:
|
|
response = await litellm.acompletion(**completion_kwargs)
|
|
raw = response.choices[0].message.content.strip()
|
|
log.info(f"[{agent.name}] ← {raw[:200]}")
|
|
|
|
# Parse JSON response
|
|
try:
|
|
parsed = json.loads(raw)
|
|
except json.JSONDecodeError:
|
|
start = raw.find("{")
|
|
end = raw.rfind("}") + 1
|
|
if start >= 0 and end > start:
|
|
parsed = json.loads(raw[start:end])
|
|
else:
|
|
parsed = {"message": raw, "pose": "neutral"}
|
|
|
|
# Validate pose
|
|
pose = parsed.get("pose", "neutral")
|
|
if pose not in valid_poses:
|
|
log.warning(f"[{agent.name}] Invalid pose '{pose}', defaulting to '{valid_poses[0]}'")
|
|
pose = valid_poses[0]
|
|
|
|
# Validate optional actions
|
|
raw_actions = parsed.get("actions")
|
|
actions = None
|
|
if isinstance(raw_actions, list) and raw_actions:
|
|
actions = [
|
|
{"emoji": a["emoji"], "label": a["label"]}
|
|
for a in raw_actions
|
|
if isinstance(a, dict) and "emoji" in a and "label" in a
|
|
] or None
|
|
|
|
return ChatResponse(
|
|
letter_id=0,
|
|
timestamp=int(time.time()),
|
|
message=parsed.get("message", raw),
|
|
pose=pose,
|
|
conversation_id=req.conversation_id,
|
|
actions=actions,
|
|
)
|
|
|
|
except Exception as e:
|
|
log.error(f"[{agent.name}] LLM error: {e}")
|
|
fallback_pose = next(
|
|
(p for p in ("annoyed", "bored", "sorry", "neutral") if p in valid_poses),
|
|
valid_poses[0],
|
|
)
|
|
return ChatResponse(
|
|
letter_id=0,
|
|
timestamp=int(time.time()),
|
|
message="*connection interference* The response channel is temporarily disrupted. Try again in a moment.",
|
|
pose=fallback_pose,
|
|
conversation_id=req.conversation_id,
|
|
)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import uvicorn
|
|
port = int(os.getenv("AGENT_INFERENCE_PORT", "8089"))
|
|
log.info(f"Starting agent inference service on port {port}")
|
|
uvicorn.run(app, host="0.0.0.0", port=port)
|