From bc8631c49bfd0579635381196b4e4c27fdc2a10b Mon Sep 17 00:00:00 2001 From: jenstandstad Date: Sun, 26 Apr 2026 17:41:29 +0200 Subject: [PATCH] Initial commit: agent-inference service Moved from gnommoweb/agent-inference. Generic LLM inference bridge supporting litellm (anthropic/openai/ollama/lm_studio), Agent Zero MCP, and Hermes JSON-RPC WebSocket agent types. Co-Authored-By: Claude Sonnet 4.6 --- .dockerignore | 3 + .gitignore | 6 + Dockerfile | 12 + dev.sh | 110 ++++++++ requirements.txt | 6 + server.py | 662 +++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 799 insertions(+) create mode 100644 .dockerignore create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100755 dev.sh create mode 100644 requirements.txt create mode 100644 server.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d594efd --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +__pycache__ +*.pyc +.env diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d14b35d --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.venv/ +__pycache__/ +*.pyc +*.pyo +.env +dev.log diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..bb6ccec --- /dev/null +++ b/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.11-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY server.py . + +EXPOSE 8089 + +CMD ["python", "server.py"] diff --git a/dev.sh b/dev.sh new file mode 100755 index 0000000..756a358 --- /dev/null +++ b/dev.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +# dev.sh - Run Agent inference inference service locally +# +# Usage: +# ./dev.sh [--port ] +# +# Options: +# --port N Port to listen on (default: 8089) +# +# Requires: Python 3, pip (installs deps from requirements.txt if missing) +# Logs are written to ./dev.log in this directory. + +set -e + +# ── Parse arguments ──────────────────────────────────────────────────────────── + +PORT=8089 + +while [[ $# -gt 0 ]]; do + case "$1" in + --port) PORT="$2"; shift 2 ;; + *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +# ── Setup ───────────────────────────────────────────────────────────────────── + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LOG_FILE="$SCRIPT_DIR/dev.log" + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' + +log() { + local color="$1"; shift + echo -e "${color}$*${NC}" + echo "$*" >> "$LOG_FILE" +} + +echo "=== dev.sh started $(date -u '+%Y-%m-%d %H:%M:%S UTC') ===" > "$LOG_FILE" +echo "Service: agent-inference Port: ${PORT}" >> "$LOG_FILE" + +log "$GREEN" "Starting Agent inference inference service on port ${PORT}..." +log "$GREEN" " Log file : ${LOG_FILE}" + +# Pull latest changes +log "$GREEN" "Pulling latest changes..." +git -C "$SCRIPT_DIR" pull --ff-only >> "$LOG_FILE" 2>&1 \ + || log "$YELLOW" "Warning: git pull failed — continuing with local state" + +# ── Python environment ───────────────────────────────────────────────────────── + +if [ ! -d "$SCRIPT_DIR/.venv" ]; then + log "$YELLOW" "Creating virtual environment..." + python3 -m venv "$SCRIPT_DIR/.venv" >> "$LOG_FILE" 2>&1 +fi + +source "$SCRIPT_DIR/.venv/bin/activate" + +if [ -f "$SCRIPT_DIR/requirements.txt" ]; then + log "$GREEN" "Installing dependencies..." + pip install -q -r "$SCRIPT_DIR/requirements.txt" >> "$LOG_FILE" 2>&1 +fi + +# ── Load .env ────────────────────────────────────────────────────────────────── + +if [ -f "$SCRIPT_DIR/.env" ]; then + log "$GREEN" "✓ Loading .env file" + set -a + source "$SCRIPT_DIR/.env" + set +a +fi + +export AGENT_INFERENCE_PORT="$PORT" + +# ── Cleanup trap ─────────────────────────────────────────────────────────────── + +cleanup() { + echo "" + log "$YELLOW" "Shutting down Agent inference inference..." + kill $SERVER_PID 2>/dev/null || true + echo "=== dev.sh stopped $(date -u '+%Y-%m-%d %H:%M:%S UTC') ===" >> "$LOG_FILE" + exit 0 +} + +trap cleanup SIGINT SIGTERM + +# ── Start server ─────────────────────────────────────────────────────────────── + +log "$GREEN" "Starting uvicorn on port ${PORT}..." +cd "$SCRIPT_DIR" +uvicorn server:app --host 0.0.0.0 --port "$PORT" --reload >> "$LOG_FILE" 2>&1 & +SERVER_PID=$! + +echo "" +echo -e "${GREEN}============================================${NC}" +echo -e "${GREEN} Agent inference inference is running!${NC}" +echo -e "${GREEN}============================================${NC}" +echo "" +echo -e " Service : ${YELLOW}http://localhost:${PORT}${NC}" +echo -e " Health : ${YELLOW}http://localhost:${PORT}/health${NC}" +echo -e " Log file : ${YELLOW}${LOG_FILE}${NC}" +echo "" +echo -e " Press ${RED}Ctrl+C${NC} to stop" +echo "" + +wait diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8852145 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +fastapi==0.135.1 +uvicorn==0.41.0 +litellm>=1.80.0 +mcp>=1.9.0 +httpx>=0.27.0 +websockets>=13.0 diff --git a/server.py b/server.py new file mode 100644 index 0000000..f2e6dd8 --- /dev/null +++ b/server.py @@ -0,0 +1,662 @@ +#!/usr/bin/env python3 +"""Agent Inference Service + +A generic LLM inference service that can serve any configured agent. +Receives the full agent profile (identity, knowledge, guardrails) and model +configuration in each request. + +Three connection modes are supported, selected by agent_type (primary) or +the endpoint's type field: + + agent_type = "standard" / type = anthropic|openai|ollama|lm_studio + Assembles a full system prompt from the agent profile and calls the model + via litellm. The LLM must respond with JSON {"message", "pose"}. + + agent_type = "agent0" / type = "agent0" + Connects to a live Agent Zero instance via its MCP server (streamable-http). + Calls the built-in send_message tool, which has full tool use, memory, and + persistent context. Pose is chosen heuristically from the plain-text reply. + MCP URL: {endpoint}/mcp/t-{mcp_key}/http + + agent_type = "hermes" + Connects to a Hermes Agent dashboard via JSON-RPC 2.0 WebSocket. + Fetches the ephemeral session token from the dashboard HTML automatically. + WebSocket: {endpoint}/api/ws?token={token} + Session: session.create → prompt.submit → stream message.delta events. + +Nothing is summarised or truncated by this service. Context-window management +for LLM calls is the responsibility of the caller (gnommoweb). +""" + +import asyncio +import os +import json +import re +import time +import logging +from fastapi import FastAPI, HTTPException, Header +from pydantic import BaseModel, Field +from typing import Optional +import httpx +import litellm + +# --- Config --- +API_KEY = os.getenv("AGENT_INFERENCE_KEY", "agent-inference-dev-key") +FALLBACK_MODEL = os.getenv("AGENT_INFERENCE_MODEL", "anthropic/claude-sonnet-4-20250514") +ANTHROPIC_KEY = os.getenv("API_KEY_ANTHROPIC", "") +AGENT_MAX_MESSAGES = int(os.getenv("AGENT_INFERENCE_MAX_MESSAGES", "10")) + +if ANTHROPIC_KEY: + os.environ["ANTHROPIC_API_KEY"] = ANTHROPIC_KEY + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("agent-inference") + +# ── In-memory Agent0 session map ────────────────────────────────────────────── +# Maps gnommoweb conversation_id → Agent Zero chat_id (context id). +# Allows conversation continuity across turns for Agent0 MCP connections. +# Cleared on service restart — acceptable for current single-user usage. +_a0_sessions: dict[int, str] = {} + +# ── In-memory Hermes session map ─────────────────────────────────────────────── +# Maps gnommoweb conversation_id → Hermes session_id. +# Also caches the dashboard token per endpoint (re-fetched on 401). +_hermes_sessions: dict[int, str] = {} +_hermes_token_cache: dict[str, str] = {} # endpoint → session token + +# ── Request / Response models ───────────────────────────────────────────────── + +class ModelConfig(BaseModel): + name: Optional[str] = None + model_id: str = "" # litellm model string; label-only for Agent0 MCP + endpoint: Optional[str] = None # base URL; api_base for litellm, or Agent0 host + api_key: Optional[str] = None # LLM API key; not used for Agent0 MCP + type: Optional[str] = None # anthropic | openai | ollama | lm_studio | hermes | agent0 + mcp_key: Optional[str] = None # Agent0 MCP token + +class ModelsPayload(BaseModel): + heaviness: int = 1 # 1=light, 2=medium, 3=heavy + light: Optional[ModelConfig] = None + medium: Optional[ModelConfig] = None + heavy: Optional[ModelConfig] = None + +class AgentProfile(BaseModel): + """Full agent profile received from gnommoweb. All fields sent verbatim.""" + name: str + role: str = "" + from_name: str = "" + poses: list[str] = Field(default_factory=list) + identity_document: str = "" + job_description: str = "" + guardrails: str = "" + best_practices: str = "" + knowledge_base: str = "" + mock_responses: list[str] = Field(default_factory=list) + agent_type: str = "standard" # standard | agent0 | hermes + +class ChatRequest(BaseModel): + user_id: int + agent_id: int = 1 + message: str + conversation_id: Optional[int] = None + history: list = Field(default_factory=list) + agent: Optional[AgentProfile] = None + models: Optional[ModelsPayload] = None + +class ChatResponse(BaseModel): + letter_id: int = 0 + timestamp: int + message: str + pose: str + conversation_id: Optional[int] = None + actions: Optional[list] = None + +# ── System prompt (LLM path only) ───────────────────────────────────────────── + +def build_system_prompt(agent: AgentProfile, message_count: int = 0) -> str: + """ + Assemble the complete system prompt for one LLM call. + + Sections (all verbatim from the agent's DB fields): + 1. Identity anchor — always present; name + role + from_name + 2. Identity document — full persona, backstory, values (if set) + 3. Job description — responsibilities and objectives (if set) + 4. Guardrails — hard constraints (if set) + 5. Best practices — response style and soft rules (if set) + 6. Knowledge base — factual grounding (if set) + 7. Poses — complete list of valid pose identifiers + 8. Response format — JSON output structure + 9. Wrap-up hint — injected near the conversation limit + """ + parts = [] + + # ── 1. Identity anchor (always present) ────────────────────────────────── + anchor_lines = [f"You are {agent.name}"] + if agent.role: + anchor_lines[0] += f", {agent.role}" + if agent.from_name and agent.from_name not in (agent.name, f"{agent.name}, {agent.role}"): + anchor_lines.append(f"You are known as: {agent.from_name}") + anchor_lines.append( + "You MUST maintain this character and persona throughout the entire conversation. " + "Do NOT break character. Do NOT acknowledge being an AI or a language model." + ) + parts.append("\n".join(anchor_lines)) + + # ── 2. Identity document (verbatim) ────────────────────────────────────── + if agent.identity_document: + parts.append(f"## IDENTITY\n\n{agent.identity_document}") + + # ── 3. Job description (verbatim) ──────────────────────────────────────── + if agent.job_description: + parts.append(f"## YOUR ROLE\n\n{agent.job_description}") + + # ── 4. Guardrails (verbatim) ───────────────────────────────────────────── + if agent.guardrails: + parts.append(f"## GUARDRAILS — FOLLOW EXACTLY\n\n{agent.guardrails}") + + # ── 5. Best practices (verbatim) ───────────────────────────────────────── + if agent.best_practices: + parts.append(f"## BEST PRACTICES\n\n{agent.best_practices}") + + # ── 6. Knowledge base (verbatim) ───────────────────────────────────────── + if agent.knowledge_base: + parts.append(f"## KNOWLEDGE BASE\n\n{agent.knowledge_base}") + + # ── 7. Poses ────────────────────────────────────────────────────────────── + valid_poses = agent.poses if agent.poses else ["neutral"] + pose_lines = [ + "## POSES", + "", + "Every response MUST include a pose field set to one of these exact identifiers:", + "", + ] + for p in valid_poses: + pose_lines.append(f" {p}") + pose_lines += [ + "", + "Choose the pose that best reflects your character's genuine emotional state in this moment.", + "Pose names are descriptive — use them accordingly.", + "Any identifier not in the list above will be rejected.", + ] + parts.append("\n".join(pose_lines)) + + # ── 8. Response format ──────────────────────────────────────────────────── + parts.append( + "## RESPONSE FORMAT\n\n" + "Respond ONLY with a valid JSON object. No prose before or after it.\n\n" + "Required fields:\n" + ' "message" — your in-character response text\n' + ' "pose" — one identifier from the POSES list above\n\n' + "Optional field:\n" + ' "actions" — array of action buttons to show the user, e.g.:\n' + ' [{"emoji": "🫳🏽", "label": "Do"}, {"emoji": "👁️", "label": "Look"}]\n' + " Omit entirely to keep the current action buttons unchanged.\n" + " Maximum 4 actions. The Leave action is always implicit.\n\n" + "Example:\n" + '{\n' + ' "message": "Your response here.",\n' + ' "pose": "neutral"\n' + '}' + ) + + # ── 9. Wrap-up hint (injected near the conversation limit) ──────────────── + wrap_up_threshold = max(AGENT_MAX_MESSAGES - 3, 4) + if message_count >= AGENT_MAX_MESSAGES: + parts.append( + "## END THIS CONVERSATION NOW\n\n" + "This conversation has reached its limit. You MUST wrap up in this response. " + "Thank the user, guide them toward whatever next step suits your role, " + "and set your pose to 'closed'. This is your final message." + ) + elif message_count >= wrap_up_threshold: + parts.append( + "NOTE: This conversation is approaching its limit. " + "Begin guiding the user toward a natural conclusion." + ) + + return "\n\n".join(parts) + +# ── Model selection ─────────────────────────────────────────────────────────── + +def select_model(payload: Optional[ModelsPayload]) -> Optional[ModelConfig]: + """ + Pick the ModelConfig based on requested heaviness. + Fallback chain: requested heaviness → lower levels → None (use env fallback). + """ + if not payload: + return None + + heaviness = payload.heaviness + candidates = [] + if heaviness >= 3 and payload.heavy: + candidates.append(payload.heavy) + if heaviness >= 2 and payload.medium: + candidates.append(payload.medium) + if payload.light: + candidates.append(payload.light) + + chosen = candidates[0] if candidates else None + if not chosen: + log.warning(f"No model configured for heaviness={heaviness}, falling back to env model") + return chosen + +# ── Pose heuristic (Agent0 MCP path) ───────────────────────────────────────── + +def pick_pose(text: str, valid_poses: list[str]) -> str: + """ + Choose a pose based on simple keyword heuristics. + Used for Agent0 MCP responses, which are plain text not JSON. + """ + t = text.lower() + candidates: list[tuple[list[str], list[str]]] = [ + (["error", "fail", "sorry", "cannot", "unable", "permission denied"], ["sorry", "annoyed", "bored"]), + (["done", "complete", "finished", "success", "pulled", "updated", "deployed", "pushed"], ["engaged", "interested", "naughty"]), + (["?", "what", "which", "how", "why", "could you"], ["inquisitive"]), + (["warning", "careful", "note", "attention"], ["annoyed", "bored"]), + ] + for keywords, poses in candidates: + if any(k in t for k in keywords): + for p in poses: + if p in valid_poses: + return p + for p in ("neutral", "neutral2", "engaged"): + if p in valid_poses: + return p + return valid_poses[0] if valid_poses else "neutral" + +# ── Agent0 MCP connection ───────────────────────────────────────────────────── + +async def handle_agent0_mcp( + req: ChatRequest, + agent: AgentProfile, + model: ModelConfig, + valid_poses: list[str], +) -> ChatResponse: + """ + Send a message to a live Agent Zero instance via its MCP streamable-http + server and return the response as a ChatResponse. + + MCP URL format: {endpoint}/mcp/t-{mcp_key}/http + Tool called: send_message (built into every Agent Zero instance) + + Conversation continuity is maintained via Agent Zero's chat_id, which maps + to a gnommoweb conversation_id in _a0_sessions (in-memory). + """ + from mcp.client.streamable_http import streamablehttp_client + from mcp import ClientSession + + endpoint = (model.endpoint or "").rstrip("/") + mcp_key = model.mcp_key or "" + mcp_url = f"{endpoint}/mcp/t-{mcp_key}/http" + + # Retrieve existing Agent0 chat_id for this conversation (if any) + a0_chat_id = _a0_sessions.get(req.conversation_id) if req.conversation_id else None + + log.info( + f"[{agent.name}] → Agent0 MCP url={mcp_url} " + f"conv={req.conversation_id} a0_chat={a0_chat_id or 'new'} " + f"msg='{req.message[:60]}'" + ) + + try: + async with streamablehttp_client(mcp_url) as (read, write, _): + async with ClientSession(read, write) as session: + await session.initialize() + + tool_args: dict = { + "message": req.message, + "persistent_chat": True, + } + if a0_chat_id: + tool_args["chat_id"] = a0_chat_id + + result = await session.call_tool("send_message", tool_args) + + # The tool returns a JSON-serialised ToolResponse / ToolError + raw = result.content[0].text if result.content else "{}" + log.info(f"[{agent.name}] ← Agent0 MCP raw: {raw[:300]}") + + parsed = json.loads(raw) + + if parsed.get("status") == "error": + raise RuntimeError(parsed.get("error", "Unknown Agent0 error")) + + response_text = parsed.get("response", "") + new_chat_id = parsed.get("chat_id", "") + + # Persist the Agent0 chat_id so the next turn continues the same context + if new_chat_id and req.conversation_id is not None: + _a0_sessions[req.conversation_id] = new_chat_id + + pose = pick_pose(response_text, valid_poses) + + return ChatResponse( + letter_id=0, + timestamp=int(time.time()), + message=response_text, + pose=pose, + conversation_id=req.conversation_id, + ) + + except Exception as e: + log.error(f"[{agent.name}] Agent0 MCP error: {e}") + fallback_pose = next( + (p for p in ("sorry", "annoyed", "neutral") if p in valid_poses), + valid_poses[0], + ) + return ChatResponse( + letter_id=0, + timestamp=int(time.time()), + message="*connection interference* The response channel is temporarily disrupted. Try again in a moment.", + pose=fallback_pose, + conversation_id=req.conversation_id, + ) + +# ── Hermes Agent connection ─────────────────────────────────────────────────── + +async def _fetch_hermes_token(endpoint: str) -> str: + """ + Fetch the ephemeral session token injected into the Hermes dashboard HTML. + + The token is generated fresh on every server start (secrets.token_urlsafe(32)) + and embedded as: window.__HERMES_SESSION_TOKEN__="" + + Caches the result per endpoint; callers should invalidate on 401. + """ + cached = _hermes_token_cache.get(endpoint) + if cached: + return cached + + async with httpx.AsyncClient(timeout=10.0) as client: + resp = await client.get(f"{endpoint}/") + resp.raise_for_status() + html = resp.text + + m = re.search(r'__HERMES_SESSION_TOKEN__\s*=\s*"([^"]+)"', html) + if not m: + raise RuntimeError( + f"Could not extract Hermes session token from {endpoint}/ " + "(is the dashboard running?)" + ) + + token = m.group(1) + _hermes_token_cache[endpoint] = token + log.info(f"[hermes] fetched new session token from {endpoint}") + return token + + +async def handle_hermes( + req: "ChatRequest", + agent: AgentProfile, + model: ModelConfig, + valid_poses: list[str], +) -> "ChatResponse": + """ + Send a message to a Hermes Agent dashboard via JSON-RPC 2.0 WebSocket. + + Protocol: + 1. Fetch ephemeral session token from dashboard HTML (cached, re-fetched on 401). + 2. Open WebSocket at {endpoint}/api/ws?token={token} + 3. JSON-RPC session.create {} → { session_id } + 4. JSON-RPC prompt.submit { session_id, text } → (ack) + 5. Accumulate message.delta payloads until message.complete + 6. Return assembled text with pose heuristic. + + Conversation continuity: gnommoweb conversation_id → Hermes session_id + stored in _hermes_sessions (in-memory, cleared on restart). + """ + try: + from websockets.asyncio.client import connect as ws_connect + except ImportError: + from websockets.client import connect as ws_connect # type: ignore + + endpoint = (model.endpoint or "").rstrip("/") + hermes_session_id = _hermes_sessions.get(req.conversation_id) if req.conversation_id else None + + token = await _fetch_hermes_token(endpoint) + + ws_scheme = "wss" if endpoint.startswith("https") else "ws" + ws_url = f"{ws_scheme}://{endpoint.split('://', 1)[-1]}/api/ws?token={token}" + + log.info( + f"[{agent.name}] → Hermes url={endpoint} " + f"conv={req.conversation_id} hermes_sess={hermes_session_id or 'new'} " + f"msg='{req.message[:60]}'" + ) + + async def _do_chat(tok: str) -> ChatResponse: + nonlocal hermes_session_id + ws_url_inner = f"{ws_scheme}://{endpoint.split('://', 1)[-1]}/api/ws?token={tok}" + + async with ws_connect(ws_url_inner) as ws: + req_id = 0 + + async def rpc(method: str, params: dict) -> dict: + nonlocal req_id + req_id += 1 + rid = f"h{req_id}" + await ws.send(json.dumps({"jsonrpc": "2.0", "id": rid, "method": method, "params": params})) + # Wait for the matching response (skip events that arrive first) + while True: + raw = await ws.recv() + msg = json.loads(raw) + if msg.get("id") == rid: + if "error" in msg: + raise RuntimeError(msg["error"].get("message", "RPC error")) + return msg.get("result") or {} + # Event frame — queue it for later (ignored for now, we don't miss + # message.delta here because prompt.submit is called after create) + + # ── 1. Create or reuse session ──────────────────────────────── + if not hermes_session_id: + created = await rpc("session.create", {}) + hermes_session_id = created.get("session_id") or created.get("id", "") + if req.conversation_id is not None and hermes_session_id: + _hermes_sessions[req.conversation_id] = hermes_session_id + log.info(f"[{agent.name}] created Hermes session {hermes_session_id}") + + # ── 2. Submit prompt ────────────────────────────────────────── + await rpc("prompt.submit", {"session_id": hermes_session_id, "text": req.message}) + + # ── 3. Stream events until message.complete ─────────────────── + full_text = "" + while True: + raw = await asyncio.wait_for(ws.recv(), timeout=120.0) + msg = json.loads(raw) + if msg.get("method") != "event": + continue + params = msg.get("params") or {} + etype = params.get("type", "") + payload = params.get("payload") or {} + if etype == "message.delta": + full_text += payload.get("text", "") + elif etype == "message.complete": + break + elif etype == "error": + raise RuntimeError(payload.get("message", "Hermes error")) + + pose = pick_pose(full_text, valid_poses) + return ChatResponse( + letter_id=0, + timestamp=int(time.time()), + message=full_text or "*silence*", + pose=pose, + conversation_id=req.conversation_id, + ) + + try: + return await _do_chat(token) + except Exception as e: + err_str = str(e) + # 401 / rejected — token likely regenerated; clear cache and retry once + if "401" in err_str or "403" in err_str or "Unauthorized" in err_str.lower(): + log.warning(f"[{agent.name}] Hermes token rejected, re-fetching…") + _hermes_token_cache.pop(endpoint, None) + try: + token = await _fetch_hermes_token(endpoint) + return await _do_chat(token) + except Exception as retry_err: + log.error(f"[{agent.name}] Hermes retry failed: {retry_err}") + err_str = str(retry_err) + + log.error(f"[{agent.name}] Hermes error: {err_str}") + fallback_pose = next( + (p for p in ("sorry", "annoyed", "neutral") if p in valid_poses), + valid_poses[0], + ) + return ChatResponse( + letter_id=0, + timestamp=int(time.time()), + message="*connection interference* The response channel is temporarily disrupted. Try again in a moment.", + pose=fallback_pose, + conversation_id=req.conversation_id, + ) + + +# ── Fallback agent ──────────────────────────────────────────────────────────── + +def make_fallback_agent() -> AgentProfile: + return AgentProfile( + name="Agent", + role="University Agent", + poses=["neutral", "bored", "annoyed", "engaged", "closed"], + identity_document="You are a helpful university agent.", + ) + +# ── App ─────────────────────────────────────────────────────────────────────── + +app = FastAPI(title="Agent Inference Service", version="1.2.0") + +@app.get("/health") +async def health(): + return {"status": "ok", "service": "agent-inference", "fallback_model": FALLBACK_MODEL} + + +@app.post("/v1/agent/chat", response_model=ChatResponse) +async def agent_chat(req: ChatRequest, authorization: str = Header(default="")): + if authorization != f"Bearer {API_KEY}": + raise HTTPException(status_code=401, detail="Unauthorised.") + + agent = req.agent or make_fallback_agent() + valid_poses = agent.poses if agent.poses else ["neutral"] + model = select_model(req.models) + + # ── Route: Hermes Agent ─────────────────────────────────────────────────── + if agent.agent_type == "hermes": + if not model or not model.endpoint: + log.error(f"[{agent.name}] Hermes agent has no model with endpoint configured") + return ChatResponse( + letter_id=0, + timestamp=int(time.time()), + message="*configuration error* No Hermes endpoint configured for this agent.", + pose=valid_poses[0], + conversation_id=req.conversation_id, + ) + return await handle_hermes(req, agent, model, valid_poses) + + # ── Route: Agent0 MCP ───────────────────────────────────────────────────── + if agent.agent_type == "agent0" or (model and model.type == "agent0"): + return await handle_agent0_mcp(req, agent, model, valid_poses) + + # ── Route: standard LLM via litellm ────────────────────────────────────── + model_id = model.model_id if model else FALLBACK_MODEL + api_base = model.endpoint if model else None + api_key = model.api_key if model else None + + if not model_id: + model_id = FALLBACK_MODEL + + total_messages = len(req.history) + 1 + system_prompt = build_system_prompt(agent, message_count=total_messages) + + # Structure: [system] + history + [current user message] + messages = [{"role": "system", "content": system_prompt}] + for msg in req.history: + role = msg.get("role", "user") + content = msg.get("content", "") + if role not in ("user", "assistant"): + role = "user" + messages.append({"role": role, "content": content}) + messages.append({"role": "user", "content": req.message}) + + log.info( + f"[{agent.name}] → {model_id} " + f"heaviness={req.models.heaviness if req.models else 1} " + f"history={len(req.history)} msgs " + f"prompt={len(system_prompt)} chars " + f"user={req.user_id} " + f"msg='{req.message[:60]}'" + ) + + completion_kwargs = dict( + model=model_id, + messages=messages, + temperature=0.8, + max_tokens=400, + ) + if api_base: + completion_kwargs["api_base"] = api_base + if api_key: + completion_kwargs["api_key"] = api_key + + try: + response = await litellm.acompletion(**completion_kwargs) + raw = response.choices[0].message.content.strip() + log.info(f"[{agent.name}] ← {raw[:200]}") + + # Parse JSON response + try: + parsed = json.loads(raw) + except json.JSONDecodeError: + start = raw.find("{") + end = raw.rfind("}") + 1 + if start >= 0 and end > start: + parsed = json.loads(raw[start:end]) + else: + parsed = {"message": raw, "pose": "neutral"} + + # Validate pose + pose = parsed.get("pose", "neutral") + if pose not in valid_poses: + log.warning(f"[{agent.name}] Invalid pose '{pose}', defaulting to '{valid_poses[0]}'") + pose = valid_poses[0] + + # Validate optional actions + raw_actions = parsed.get("actions") + actions = None + if isinstance(raw_actions, list) and raw_actions: + actions = [ + {"emoji": a["emoji"], "label": a["label"]} + for a in raw_actions + if isinstance(a, dict) and "emoji" in a and "label" in a + ] or None + + return ChatResponse( + letter_id=0, + timestamp=int(time.time()), + message=parsed.get("message", raw), + pose=pose, + conversation_id=req.conversation_id, + actions=actions, + ) + + except Exception as e: + log.error(f"[{agent.name}] LLM error: {e}") + fallback_pose = next( + (p for p in ("annoyed", "bored", "sorry", "neutral") if p in valid_poses), + valid_poses[0], + ) + return ChatResponse( + letter_id=0, + timestamp=int(time.time()), + message="*connection interference* The response channel is temporarily disrupted. Try again in a moment.", + pose=fallback_pose, + conversation_id=req.conversation_id, + ) + + +if __name__ == "__main__": + import uvicorn + port = int(os.getenv("AGENT_INFERENCE_PORT", "8089")) + log.info(f"Starting agent inference service on port {port}") + uvicorn.run(app, host="0.0.0.0", port=port)