From 44633ac6d67fd91de7e2cbbbfd92299e99b5f0bb Mon Sep 17 00:00:00 2001
From: jenstandstad <jens.tandstad@gmail.com>
Date: Sat, 9 May 2026 19:36:03 +0200
Subject: [PATCH] Modifications to many skills

---
 agents/gerhard-hermes/channel_directory.json  |   7 +-
 agents/gerhard-hermes/gateway.lock            |   2 +-
 agents/gerhard-hermes/gateway.pid             |   2 +-
 agents/gerhard-hermes/gateway_state.json      |   2 +-
 .../gerhard-hermes/skills/.bundled_manifest   |  65 +-
 .../skills/apple/apple-notes/SKILL.md         |   2 +-
 .../skills/apple/apple-reminders/SKILL.md     |   2 +-
 .../skills/apple/findmy/SKILL.md              |   2 +-
 .../autonomous-ai-agents/claude-code/SKILL.md |   2 +-
 .../hermes-agent/SKILL.md                     |  62 +-
 .../creative/architecture-diagram/SKILL.md    |   2 +-
 .../skills/creative/ascii-art/SKILL.md        |   2 +-
 .../skills/creative/ascii-video/SKILL.md      |  10 +-
 .../skills/creative/baoyu-comic/SKILL.md      |   2 +-
 .../creative/baoyu-infographic/SKILL.md       |   2 +-
 .../creative/creative-ideation/SKILL.md       |   6 +-
 .../skills/creative/design-md/SKILL.md        |   8 +-
 .../skills/creative/p5js/SKILL.md             |  10 +-
 .../skills/creative/pixel-art/SKILL.md        |   2 +-
 .../songwriting-and-ai-music/SKILL.md         |   5 +-
 .../data-science/jupyter-live-kernel/SKILL.md |   7 +-
 .../devops/webhook-subscriptions/SKILL.md     |   2 +-
 agents/gerhard-hermes/skills/dogfood/SKILL.md |   2 +-
 .../skills/email/himalaya/SKILL.md            |   2 +-
 .../github/codebase-inspection/SKILL.md       |   2 +-
 .../skills/github/github-auth/SKILL.md        |   2 +-
 .../skills/github/github-code-review/SKILL.md |   2 +-
 .../skills/github/github-issues/SKILL.md      |   2 +-
 .../skills/github/github-pr-workflow/SKILL.md |   2 +-
 .../github/github-repo-management/SKILL.md    |   2 +-
 .../skills/mcp/native-mcp/SKILL.md            |   2 +-
 .../skills/media/gif-search/SKILL.md          |   6 +-
 .../skills/media/songsee/SKILL.md             |   2 +-
 .../SKILL.md                                  |   0
 .../references/api-evaluation.md              | 490 ++++++++++++++
 .../references/benchmark-guide.md             | 488 ++++++++++++++
 .../references/custom-tasks.md                | 602 ++++++++++++++++++
 .../references/distributed-eval.md            | 519 +++++++++++++++
 .../skills/mlops/huggingface-hub/SKILL.md     |   2 +-
 .../skills/mlops/inference/outlines/SKILL.md  |   2 +-
 .../mlops/models/segment-anything/SKILL.md    |   2 +-
 .../mlops/training/trl-fine-tuning/SKILL.md   |   2 +-
 .../skills/mlops/training/unsloth/SKILL.md    |   2 +-
 .../skills/productivity/linear/SKILL.md       |   2 +-
 .../skills/productivity/maps/SKILL.md         |   7 +-
 .../productivity/maps/scripts/maps_client.py  |   5 -
 .../skills/productivity/nano-pdf/SKILL.md     |   2 +-
 .../skills/productivity/notion/SKILL.md       |   2 +-
 .../skills/productivity/powerpoint/SKILL.md   |   6 +-
 .../skills/software-development/plan/SKILL.md |   2 +-
 .../requesting-code-review/SKILL.md           |   5 +-
 .../subagent-driven-development/SKILL.md      |  11 +-
 .../references/context-budget-discipline.md   |  53 --
 .../references/gates-taxonomy.md              |  93 ---
 .../systematic-debugging/SKILL.md             |   2 +-
 .../test-driven-development/SKILL.md          |   2 +-
 .../writing-plans/SKILL.md                    |   2 +-
 agents/hermes/.gitkeep                        |   0
 .../agent0/prompts/agent.system.main.role.md  |   3 +
 .../plugins/_office/stale-cleanup-v2.done     |   1 +
 agents/hermes/scheduler/tasks.json            |   1 +
 scripts/pull-agent-identity.py                |  18 +-
 62 files changed, 2213 insertions(+), 345 deletions(-)
 rename agents/gerhard-hermes/skills/mlops/evaluation/{lm-evaluation-harness => lm-evaluation-harness.bak}/SKILL.md (100%)
 create mode 100644 agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/api-evaluation.md
 create mode 100644 agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/benchmark-guide.md
 create mode 100644 agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/custom-tasks.md
 create mode 100644 agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/distributed-eval.md
 delete mode 100644 agents/gerhard-hermes/skills/software-development/subagent-driven-development/references/context-budget-discipline.md
 delete mode 100644 agents/gerhard-hermes/skills/software-development/subagent-driven-development/references/gates-taxonomy.md
 create mode 100644 agents/hermes/.gitkeep
 create mode 100644 agents/hermes/agents/agent0/prompts/agent.system.main.role.md
 create mode 100644 agents/hermes/plugins/_office/stale-cleanup-v2.done
 create mode 100644 agents/hermes/scheduler/tasks.json

diff --git a/agents/gerhard-hermes/channel_directory.json b/agents/gerhard-hermes/channel_directory.json
index b967cf5..5e7545c 100644
--- a/agents/gerhard-hermes/channel_directory.json
+++ b/agents/gerhard-hermes/channel_directory.json
@@ -1,5 +1,5 @@
 {
-  "updated_at": "2026-05-09T13:32:28.095733",
+  "updated_at": "2026-05-09T17:16:32.631120",
   "platforms": {
     "telegram": [],
     "discord": [],
@@ -17,9 +17,6 @@
     "wecom_callback": [],
     "weixin": [],
     "bluebubbles": [],
-    "qqbot": [],
-    "yuanbao": [],
-    "irc": [],
-    "teams": []
+    "qqbot": []
   }
 }
\ No newline at end of file
diff --git a/agents/gerhard-hermes/gateway.lock b/agents/gerhard-hermes/gateway.lock
index ab7a409..6fb6f4b 100644
--- a/agents/gerhard-hermes/gateway.lock
+++ b/agents/gerhard-hermes/gateway.lock
@@ -1 +1 @@
-{"pid": 7, "kind": "hermes-gateway", "argv": ["/opt/hermes/.venv/bin/hermes", "gateway", "run"], "start_time": 1823057}
\ No newline at end of file
+{"pid": 7, "kind": "hermes-gateway", "argv": ["/opt/hermes/.venv/bin/hermes", "gateway", "run"], "start_time": 97270363}
\ No newline at end of file
diff --git a/agents/gerhard-hermes/gateway.pid b/agents/gerhard-hermes/gateway.pid
index ab7a409..6fb6f4b 100755
--- a/agents/gerhard-hermes/gateway.pid
+++ b/agents/gerhard-hermes/gateway.pid
@@ -1 +1 @@
-{"pid": 7, "kind": "hermes-gateway", "argv": ["/opt/hermes/.venv/bin/hermes", "gateway", "run"], "start_time": 1823057}
\ No newline at end of file
+{"pid": 7, "kind": "hermes-gateway", "argv": ["/opt/hermes/.venv/bin/hermes", "gateway", "run"], "start_time": 97270363}
\ No newline at end of file
diff --git a/agents/gerhard-hermes/gateway_state.json b/agents/gerhard-hermes/gateway_state.json
index bf75e35..622fbb3 100644
--- a/agents/gerhard-hermes/gateway_state.json
+++ b/agents/gerhard-hermes/gateway_state.json
@@ -1 +1 @@
-{"pid":7,"kind":"hermes-gateway","argv":["/opt/hermes/.venv/bin/hermes","gateway","run"],"start_time":1823057,"gateway_state":"running","exit_reason":null,"restart_requested":false,"active_agents":0,"platforms":{},"updated_at":"2026-05-09T13:32:28.084678+00:00"}
\ No newline at end of file
+{"pid": 7, "kind": "hermes-gateway", "argv": ["/opt/hermes/.venv/bin/hermes", "gateway", "run"], "start_time": 97270363, "gateway_state": "running", "exit_reason": null, "restart_requested": false, "active_agents": 0, "platforms": {}, "updated_at": "2026-05-09T17:16:32.582158+00:00"}
\ No newline at end of file
diff --git a/agents/gerhard-hermes/skills/.bundled_manifest b/agents/gerhard-hermes/skills/.bundled_manifest
index 0ba5045..83f1ceb 100644
--- a/agents/gerhard-hermes/skills/.bundled_manifest
+++ b/agents/gerhard-hermes/skills/.bundled_manifest
@@ -1,89 +1,74 @@
-airtable:dec8bcab05383e0ca8ae0e3c241d3a48
-apple-notes:5e448abf984561fb33b197045ce41388
-apple-reminders:cda2963c73800643faf4a34ef813879a
-architecture-diagram:8ed67034726b0ac3639d9c009d166222
+apple-notes:16ffca134c5590714781d8aeef51f8f3
+apple-reminders:0273a9a17f6d07c55c84735c4366186b
+architecture-diagram:999ab6d4445dbd407a82031857aa9791
 arxiv:0ad5eb32727a1cb2bbff9e1e8e4dbff7
 ascii-art:6eed9eb0c7cedf2bccd3cb7b7c91271c
-ascii-video:ab08372213418d643c81445fe759c28e
+ascii-video:93697173a0a33f7ecb7c4dc1c27f80e8
 audiocraft-audio-generation:41d06b6ec94d1cdb3d864efe452780fd
 axolotl:710b8e88805a85efc461dcd70c937cae
 baoyu-comic:0be1250d5433538d71a4ab6d81b359dc
-baoyu-infographic:567069c2548a69eafcbce09c028438dd
+baoyu-infographic:d00f808010611c77d3fe00f58d2d7176
 blogwatcher:d0b55ef6acff9ad26f1febace610ca3b
 claude-code:88bbb9f0e26f8148141da379e4e837c5
-claude-design:6607092a7d19705b9647067a09afd733
-codebase-inspection:97bf36f290117abc11ffde72535713e2
+codebase-inspection:5b1f99e926f347fe7c8c2658c9cc15b9
 codex:79bb6b5d9b47453cd0d7ac25df5a3c97
-comfyui:d6f42584ff328d6aa6a4b2e8e678c030
-debugging-hermes-tui-commands:f992bee7976a1d0f59884fa57e58f314
-design-md:a09844075e6e856a4a256dbc5f9e899a
-dogfood:77ff237be7db22a4ef3850b411d915ed
+design-md:267d0d8c363c9809744d1c62d561805e
+dogfood:fc03244c3237e6b7325dc8aef387f2e3
 dspy:5e0770e2563d11d9d4cc040681277c1c
 evaluating-llms-harness:784cd66354b654dedf7541cd9b9e4c91
 excalidraw:1679ad1d31a591fa3cb636d9150adcc7
-findmy:1d7dd3ae39cf25357a374c6bfb956442
-fine-tuning-with-trl:f73c765998375978e9fe529cafa6054a
+findmy:bd50940d7b0104f6d6bf8981fc54b827
+fine-tuning-with-trl:b2f0948b0f6e7202a452d9569bbd8f64
 gif-search:dc9206e5c5c2d648774864df5222c95f
-github-auth:6afa4cccb1eacad83dcdae2930b818a9
-github-code-review:41071b74c0222d4e784de8f0927f757d
-github-issues:3e4d98c7a6b1ebd0a55c752abb7a612b
-github-pr-workflow:834e9cd72f18ea4598934d8d253b5858
-github-repo-management:8479a9fb418f8dcfbbb191caaeccaa37
+github-auth:909ef9bbff492b214a625179f704c09a
+github-code-review:e56793f8efef112bbcdad96f69b45ddd
+github-issues:ecb864a88aeea8f88f5b8742fec8806b
+github-pr-workflow:cab1d57b84e253dddff37bd212f469ca
+github-repo-management:7d7131b113d4dc2509a47501a6638e76
 godmode:c592b460bf06e1f31b51bc6ac299e111
 google-workspace:cf9028aff358f6c6b6ebc183672ad947
 heartmula:ce53b2e6c9d68238cae5ae727738ecde
-hermes-agent:286e1312a50b53f11b9714f506989e4f
-hermes-agent-skill-authoring:d5b8b704b92d44ffa1e44f8b3d795037
+hermes-agent:1c55510fc8a7a8c0fee3134866ca5dc2
 himalaya:9da608734d1af8dab132406492bd5828
-huggingface-hub:c02809f64f3a534ad1970e094474f04f
-humanizer:0a006757e41d605ba0818ecca10288ed
+huggingface-hub:14002a449cb5f9a5ff8bdc7f730bcb2f
 ideation:0d1719daa364f2c5badd40c94620360f
 imessage:f545da0f5cc64dd9ee1ffd2b7733a11b
-jupyter-live-kernel:54612d9f0ff1b5eb6564f2dfeb5102b7
-kanban-orchestrator:1636b60c79180ee89108727bff9383c7
-kanban-worker:bc9124639762b2a5c20cd85580ae92e4
+jupyter-live-kernel:6bda9690d8c71095ac738bd9825e32f2
 linear:ab7a5dbd4001e31e2bd888d86ab699f8
 llama-cpp:fcfa4c23d52ac84abccf0b38e9844e07
 llm-wiki:9cb710c49d1af6fdba54d06a835a5498
 manim-video:86ba8c24fdd57771d68bea812d3b2466
-maps:5c8bb0a45921760a9c8f598ebfe7631e
+maps:285f3436aafadf452fac8c0bb5715e40
 minecraft-modpack-server:3cc682f8aef5f86d3580601ba28f9ba3
 nano-pdf:dd55aca10b8e2844a0cda3c68c757e83
-native-mcp:5564a9d31ce4165b532c575a315ddca4
-node-inspect-debugger:e8f38e8586a090b880edcdbcba67ec76
-notion:e24ae292897a6ca7837867864bc82c3c
+native-mcp:a8644a4f45c8403c1ad3342230b5c154
+notion:ac54a68c490d4cf1604bc24160083d43
 obliteratus:98dfcbfcad4416d27d5dcbd0a491d772
 obsidian:1dde562f384c6dc5eaec0b7c214caab4
 ocr-and-documents:0fe461668b245d894370b8b40c3eb012
 opencode:e3583bfa72da47385f6466eaf226faef
 openhue:0487b4695a071cc62da64c79935bc8d1
-outlines:ac034ba450bf3d0d05eb736dddcd117f
-p5js:5879c824a5487d6553d9380e37aa9c5e
+outlines:8efbd31f1252f6c8fb340db4d9dcce2f
+p5js:80de285f6ef54c19c22e4eafd1877fe4
 pixel-art:f94fe511926a222052ec8d2dc892b112
 plan:6a014103919a9b11d60e2d6267055871
 pokemon-player:2a30ed51c1179b22967fb4a33e6e57e4
 polymarket:b4a7d758f2fb29efb290dce1094cc625
 popular-web-designs:a77ef442dcf747d8d534f5acb6b6f0cf
 powerpoint:6ae6326c8fc5ff5a67b8e5283437ec30
-pretext:1a72b0c0b65188ce43917cac6d5b8973
-python-debugpy:d40cd39a90885e2c5ac7be13bbf5e832
-requesting-code-review:f76de34aee69387c297cf982c85fd6fe
+requesting-code-review:f9cc90df11a9ce1cc23595c574eacd75
 research-paper-writing:e1fa7bb71e73fbc74ea017720f971e9a
 segment-anything-model:a2403c1bf179c28cbac2ba7d56357b69
 serving-llms-vllm:a8b5453a5316da8df055a0f23c3cbd25
-sketch:56b3e77b9ff82d38fe1c7b8c6067de5d
 songsee:7738e32bff3ca9ec32b37b32e0a8c9ca
 songwriting-and-ai-music:65b4a6757901021ca16d9c8ecab62f7c
-spike:a1034fab3d8669745ee75474dd9c3a6b
 spotify:af733b32166f235fe3e0026e213ff2d4
 subagent-driven-development:3d4c3f5060b7e1577fc3306b9ca36ffd
 systematic-debugging:a02cf3ccd7b79909137ac1af46d01ed6
 test-driven-development:32bc0784dc0720a9e536ba1ce559fedf
-touchdesigner-mcp:3a428984eb83905c5ae89d0abf0ef866
 unsloth:6482bcde01d0a9aeaddc247932c3c69c
 webhook-subscriptions:edce3200566edfa7259718b51b8f52f3
 weights-and-biases:91fd048a0b693f6d74a4639ea08bbd1d
-writing-plans:c91061baf59682c9b10a317b5ff25617
+writing-plans:5b72a4318524fd7ffb37fd43e51e3954
 xurl:97a1749bd7274b93c631d71d2cf92e52
 youtube-content:c448e213097433492d51a063d34eb9ae
-yuanbao:69fa2e9e8b534a633443d47262e86855
diff --git a/agents/gerhard-hermes/skills/apple/apple-notes/SKILL.md b/agents/gerhard-hermes/skills/apple/apple-notes/SKILL.md
index 020f0d6..33fb3ef 100644
--- a/agents/gerhard-hermes/skills/apple/apple-notes/SKILL.md
+++ b/agents/gerhard-hermes/skills/apple/apple-notes/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: apple-notes
-description: "Manage Apple Notes via memo CLI: create, search, edit."
+description: Manage Apple Notes via the memo CLI on macOS (create, view, search, edit).
 version: 1.0.0
 author: Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/apple/apple-reminders/SKILL.md b/agents/gerhard-hermes/skills/apple/apple-reminders/SKILL.md
index 37c4fa7..7af3933 100644
--- a/agents/gerhard-hermes/skills/apple/apple-reminders/SKILL.md
+++ b/agents/gerhard-hermes/skills/apple/apple-reminders/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: apple-reminders
-description: "Apple Reminders via remindctl: add, list, complete."
+description: Manage Apple Reminders via remindctl CLI (list, add, complete, delete).
 version: 1.0.0
 author: Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/apple/findmy/SKILL.md b/agents/gerhard-hermes/skills/apple/findmy/SKILL.md
index e2bed38..c009b3e 100644
--- a/agents/gerhard-hermes/skills/apple/findmy/SKILL.md
+++ b/agents/gerhard-hermes/skills/apple/findmy/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: findmy
-description: "Track Apple devices/AirTags via FindMy.app on macOS."
+description: Track Apple devices and AirTags via FindMy.app on macOS using AppleScript and screen capture.
 version: 1.0.0
 author: Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/autonomous-ai-agents/claude-code/SKILL.md b/agents/gerhard-hermes/skills/autonomous-ai-agents/claude-code/SKILL.md
index cf7692c..0b39b5c 100644
--- a/agents/gerhard-hermes/skills/autonomous-ai-agents/claude-code/SKILL.md
+++ b/agents/gerhard-hermes/skills/autonomous-ai-agents/claude-code/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: claude-code
-description: "Delegate coding to Claude Code CLI (features, PRs)."
+description: Delegate coding tasks to Claude Code (Anthropic's CLI agent). Use for building features, refactoring, PR reviews, and iterative coding. Requires the claude CLI installed.
 version: 2.2.0
 author: Hermes Agent + Teknium
 license: MIT
diff --git a/agents/gerhard-hermes/skills/autonomous-ai-agents/hermes-agent/SKILL.md b/agents/gerhard-hermes/skills/autonomous-ai-agents/hermes-agent/SKILL.md
index d97b39f..4ed03a9 100644
--- a/agents/gerhard-hermes/skills/autonomous-ai-agents/hermes-agent/SKILL.md
+++ b/agents/gerhard-hermes/skills/autonomous-ai-agents/hermes-agent/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: hermes-agent
-description: "Configure, extend, or contribute to Hermes Agent."
+description: Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, profiles, and a concise contributor reference. Load this skill when helping users configure Hermes, troubleshoot issues, spawn agent instances, or make code contributions.
 version: 2.0.0
 author: Hermes Agent + Teknium
 license: MIT
@@ -115,7 +115,7 @@ hermes tools disable NAME   Disable a toolset
 
 hermes skills list          List installed skills
 hermes skills search QUERY  Search the skills hub
-hermes skills install ID    Install a skill (ID can be a hub identifier OR a direct https://…/SKILL.md URL; pass --name to override when frontmatter has no name)
+hermes skills install ID    Install a skill
 hermes skills inspect ID    Preview without installing
 hermes skills config        Enable/disable skills per platform
 hermes skills check         Check for updates
@@ -281,6 +281,7 @@ Type these during an interactive chat session.
 ### Utility
 ```
 /branch (/fork)      Branch the current session
+/btw                 Ephemeral side question (doesn't interrupt main task)
 /fast                Toggle priority/fast processing
 /browser             Open CDP browser connection
 /history             Show conversation history (CLI)
@@ -402,63 +403,6 @@ Tool changes take effect on `/reset` (new session). They do NOT apply mid-conver
 
 ---
 
-## Security & Privacy Toggles
-
-Common "why is Hermes doing X to my output / tool calls / commands?" toggles — and the exact commands to change them. Most of these need a fresh session (`/reset` in chat, or start a new `hermes` invocation) because they're read once at startup.
-
-### Secret redaction in tool output
-
-Secret redaction is **off by default** — tool output (terminal stdout, `read_file`, web content, subagent summaries, etc.) passes through unmodified. If the user wants Hermes to auto-mask strings that look like API keys, tokens, and secrets before they enter the conversation context and logs:
-
-```bash
-hermes config set security.redact_secrets true       # enable globally
-```
-
-**Restart required.** `security.redact_secrets` is snapshotted at import time — toggling it mid-session (e.g. via `export HERMES_REDACT_SECRETS=true` from a tool call) will NOT take effect for the running process. Tell the user to run `hermes config set security.redact_secrets true` in a terminal, then start a new session. This is deliberate — it prevents an LLM from flipping the toggle on itself mid-task.
-
-Disable again with:
-```bash
-hermes config set security.redact_secrets false
-```
-
-### PII redaction in gateway messages
-
-Separate from secret redaction. When enabled, the gateway hashes user IDs and strips phone numbers from the session context before it reaches the model:
-
-```bash
-hermes config set privacy.redact_pii true    # enable
-hermes config set privacy.redact_pii false   # disable (default)
-```
-
-### Command approval prompts
-
-By default (`approvals.mode: manual`), Hermes prompts the user before running shell commands flagged as destructive (`rm -rf`, `git reset --hard`, etc.). The modes are:
-
-- `manual` — always prompt (default)
-- `smart` — use an auxiliary LLM to auto-approve low-risk commands, prompt on high-risk
-- `off` — skip all approval prompts (equivalent to `--yolo`)
-
-```bash
-hermes config set approvals.mode smart       # recommended middle ground
-hermes config set approvals.mode off         # bypass everything (not recommended)
-```
-
-Per-invocation bypass without changing config:
-- `hermes --yolo …`
-- `export HERMES_YOLO_MODE=1`
-
-Note: YOLO / `approvals.mode: off` does NOT turn off secret redaction. They are independent.
-
-### Shell hooks allowlist
-
-Some shell-hook integrations require explicit allowlisting before they fire. Managed via `~/.hermes/shell-hooks-allowlist.json` — prompted interactively the first time a hook wants to run.
-
-### Disabling the web/browser/image-gen tools
-
-To keep the model away from network or media tools entirely, open `hermes tools` and toggle per-platform. Takes effect on next session (`/reset`). See the Tools & Skills section above.
-
----
-
 ## Voice & Transcription
 
 ### STT (Voice → Text)
diff --git a/agents/gerhard-hermes/skills/creative/architecture-diagram/SKILL.md b/agents/gerhard-hermes/skills/creative/architecture-diagram/SKILL.md
index a49a42c..1e1749d 100644
--- a/agents/gerhard-hermes/skills/creative/architecture-diagram/SKILL.md
+++ b/agents/gerhard-hermes/skills/creative/architecture-diagram/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: architecture-diagram
-description: "Dark-themed SVG architecture/cloud/infra diagrams as HTML."
+description: Generate dark-themed SVG diagrams of software systems and cloud infrastructure as standalone HTML files with inline SVG graphics. Semantic component colors (cyan=frontend, emerald=backend, violet=database, amber=cloud/AWS, rose=security, orange=message bus), JetBrains Mono font, grid background. Best suited for software architecture, cloud/VPC topology, microservice maps, service-mesh diagrams, database + API layer diagrams, security groups, message buses — anything that fits a tech-infra deck with a dark aesthetic. If a more specialized diagramming skill exists for the subject (scientific, educational, hand-drawn, animated, etc.), prefer that — otherwise this skill can also serve as a general-purpose SVG diagram fallback. Based on Cocoon AI's architecture-diagram-generator (MIT).
 version: 1.0.0
 author: Cocoon AI (hello@cocoon-ai.com), ported by Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/creative/ascii-art/SKILL.md b/agents/gerhard-hermes/skills/creative/ascii-art/SKILL.md
index fe1f6bb..1afe7ff 100644
--- a/agents/gerhard-hermes/skills/creative/ascii-art/SKILL.md
+++ b/agents/gerhard-hermes/skills/creative/ascii-art/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: ascii-art
-description: "ASCII art: pyfiglet, cowsay, boxes, image-to-ascii."
+description: Generate ASCII art using pyfiglet (571 fonts), cowsay, boxes, toilet, image-to-ascii, remote APIs (asciified, ascii.co.uk), and LLM fallback. No API keys required.
 version: 4.0.0
 author: 0xbyt4, Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/creative/ascii-video/SKILL.md b/agents/gerhard-hermes/skills/creative/ascii-video/SKILL.md
index 59843c0..704a561 100644
--- a/agents/gerhard-hermes/skills/creative/ascii-video/SKILL.md
+++ b/agents/gerhard-hermes/skills/creative/ascii-video/SKILL.md
@@ -1,18 +1,10 @@
 ---
 name: ascii-video
-description: "ASCII video: convert video/audio to colored ASCII MP4/GIF."
+description: "Production pipeline for ASCII art video — any format. Converts video/audio/images/generative input into colored ASCII character video output (MP4, GIF, image sequence). Covers: video-to-ASCII conversion, audio-reactive music visualizers, generative ASCII art animations, hybrid video+audio reactive, text/lyrics overlays, real-time terminal rendering. Use when users request: ASCII video, text art video, terminal-style video, character art animation, retro text visualization, audio visualizer in ASCII, converting video to ASCII art, matrix-style effects, or any animated ASCII output."
 ---
 
 # ASCII Video Production Pipeline
 
-## When to use
-
-Use when users request: ASCII video, text art video, terminal-style video, character art animation, retro text visualization, audio visualizer in ASCII, converting video to ASCII art, matrix-style effects, or any animated ASCII output.
-
-## What's inside
-
-Production pipeline for ASCII art video — any format. Converts video/audio/images/generative input into colored ASCII character video output (MP4, GIF, image sequence). Covers: video-to-ASCII conversion, audio-reactive music visualizers, generative ASCII art animations, hybrid video+audio reactive, text/lyrics overlays, real-time terminal rendering.
-
 ## Creative Standard
 
 This is visual art. ASCII characters are the medium; cinema is the standard.
diff --git a/agents/gerhard-hermes/skills/creative/baoyu-comic/SKILL.md b/agents/gerhard-hermes/skills/creative/baoyu-comic/SKILL.md
index 6b3bef6..d3c89ed 100644
--- a/agents/gerhard-hermes/skills/creative/baoyu-comic/SKILL.md
+++ b/agents/gerhard-hermes/skills/creative/baoyu-comic/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: baoyu-comic
-description: "Knowledge comics (知识漫画): educational, biography, tutorial."
+description: Knowledge comic creator supporting multiple art styles and tones. Creates original educational comics with detailed panel layouts and sequential image generation. Use when user asks to create "知识漫画", "教育漫画", "biography comic", "tutorial comic", or "Logicomix-style comic".
 version: 1.56.1
 author: 宝玉 (JimLiu)
 license: MIT
diff --git a/agents/gerhard-hermes/skills/creative/baoyu-infographic/SKILL.md b/agents/gerhard-hermes/skills/creative/baoyu-infographic/SKILL.md
index 740bd16..fea3499 100644
--- a/agents/gerhard-hermes/skills/creative/baoyu-infographic/SKILL.md
+++ b/agents/gerhard-hermes/skills/creative/baoyu-infographic/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: baoyu-infographic
-description: "Infographics: 21 layouts x 21 styles (信息图, 可视化)."
+description: Generate professional infographics with 21 layout types and 21 visual styles. Analyzes content, recommends layout×style combinations, and generates publication-ready infographics. Use when user asks to create "infographic", "visual summary", "信息图", "可视化", or "高密度信息大图".
 version: 1.56.1
 author: 宝玉 (JimLiu)
 license: MIT
diff --git a/agents/gerhard-hermes/skills/creative/creative-ideation/SKILL.md b/agents/gerhard-hermes/skills/creative/creative-ideation/SKILL.md
index 767e867..a5feba5 100644
--- a/agents/gerhard-hermes/skills/creative/creative-ideation/SKILL.md
+++ b/agents/gerhard-hermes/skills/creative/creative-ideation/SKILL.md
@@ -1,7 +1,7 @@
 ---
 name: ideation
 title: Creative Ideation — Constraint-Driven Project Generation
-description: "Generate project ideas via creative constraints."
+description: "Generate project ideas through creative constraints. Use when the user says 'I want to build something', 'give me a project idea', 'I'm bored', 'what should I make', 'inspire me', or any variant of 'I have tools but no direction'. Works for code, art, hardware, writing, tools, and anything that can be made."
 version: 1.0.0
 author: SHL0MS
 license: MIT
@@ -14,10 +14,6 @@ metadata:
 
 # Creative Ideation
 
-## When to use
-
-Use when the user says 'I want to build something', 'give me a project idea', 'I'm bored', 'what should I make', 'inspire me', or any variant of 'I have tools but no direction'. Works for code, art, hardware, writing, tools, and anything that can be made.
-
 Generate project ideas through creative constraints. Constraint + direction = creativity.
 
 ## How It Works
diff --git a/agents/gerhard-hermes/skills/creative/design-md/SKILL.md b/agents/gerhard-hermes/skills/creative/design-md/SKILL.md
index 5884a60..36c4138 100644
--- a/agents/gerhard-hermes/skills/creative/design-md/SKILL.md
+++ b/agents/gerhard-hermes/skills/creative/design-md/SKILL.md
@@ -1,13 +1,13 @@
 ---
 name: design-md
-description: Author/validate/export Google's DESIGN.md token spec files.
+description: Author, validate, diff, and export DESIGN.md files — Google's open-source format spec that gives coding agents a persistent, structured understanding of a design system (tokens + rationale in one file). Use when building a design system, porting style rules between projects, generating UI with consistent brand, or auditing accessibility/contrast.
 version: 1.0.0
 author: Hermes Agent
 license: MIT
 metadata:
   hermes:
     tags: [design, design-system, tokens, ui, accessibility, wcag, tailwind, dtcg, google]
-    related_skills: [popular-web-designs, claude-design, excalidraw, architecture-diagram]
+    related_skills: [popular-web-designs, excalidraw, architecture-diagram]
 ---
 
 # DESIGN.md Skill
@@ -31,9 +31,7 @@ diffs versions for regressions, and exports to Tailwind or W3C DTCG JSON.
 - User wants contrast / WCAG accessibility validation on their color palette
 
 For purely visual inspiration or layout examples, use `popular-web-designs`
-instead. For *process and taste* when designing a one-off HTML artifact
-from scratch (prototype, deck, landing page, component lab), use
-`claude-design`. This skill is for the *formal spec file* itself.
+instead. This skill is for the *formal spec file* itself.
 
 ## File anatomy
 
diff --git a/agents/gerhard-hermes/skills/creative/p5js/SKILL.md b/agents/gerhard-hermes/skills/creative/p5js/SKILL.md
index ff0a955..1b8e618 100644
--- a/agents/gerhard-hermes/skills/creative/p5js/SKILL.md
+++ b/agents/gerhard-hermes/skills/creative/p5js/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: p5js
-description: "p5.js sketches: gen art, shaders, interactive, 3D."
+description: "Production pipeline for interactive and generative visual art using p5.js. Creates browser-based sketches, generative art, data visualizations, interactive experiences, 3D scenes, audio-reactive visuals, and motion graphics — exported as HTML, PNG, GIF, MP4, or SVG. Covers: 2D/3D rendering, noise and particle systems, flow fields, shaders (GLSL), pixel manipulation, kinetic typography, WebGL scenes, audio analysis, mouse/keyboard interaction, and headless high-res export. Use when users request: p5.js sketches, creative coding, generative art, interactive visualizations, canvas animations, browser-based visual art, data viz, shader effects, or any p5.js project."
 version: 1.0.0
 metadata:
   hermes:
@@ -10,14 +10,6 @@ metadata:
 
 # p5.js Production Pipeline
 
-## When to use
-
-Use when users request: p5.js sketches, creative coding, generative art, interactive visualizations, canvas animations, browser-based visual art, data viz, shader effects, or any p5.js project.
-
-## What's inside
-
-Production pipeline for interactive and generative visual art using p5.js. Creates browser-based sketches, generative art, data visualizations, interactive experiences, 3D scenes, audio-reactive visuals, and motion graphics — exported as HTML, PNG, GIF, MP4, or SVG. Covers: 2D/3D rendering, noise and particle systems, flow fields, shaders (GLSL), pixel manipulation, kinetic typography, WebGL scenes, audio analysis, mouse/keyboard interaction, and headless high-res export.
-
 ## Creative Standard
 
 This is visual art rendered in the browser. The canvas is the medium; the algorithm is the brush.
diff --git a/agents/gerhard-hermes/skills/creative/pixel-art/SKILL.md b/agents/gerhard-hermes/skills/creative/pixel-art/SKILL.md
index 596712b..e123fc6 100644
--- a/agents/gerhard-hermes/skills/creative/pixel-art/SKILL.md
+++ b/agents/gerhard-hermes/skills/creative/pixel-art/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: pixel-art
-description: "Pixel art w/ era palettes (NES, Game Boy, PICO-8)."
+description: Convert images into retro pixel art with hardware-accurate palettes (NES, Game Boy, PICO-8, C64, etc.), and animate them into short videos. Presets cover arcade, SNES, and 10+ era-correct looks. Use `clarify` to let the user pick a style before generating.
 version: 2.0.0
 author: dodo-reach
 license: MIT
diff --git a/agents/gerhard-hermes/skills/creative/songwriting-and-ai-music/SKILL.md b/agents/gerhard-hermes/skills/creative/songwriting-and-ai-music/SKILL.md
index 84bc3bc..2f1fc72 100644
--- a/agents/gerhard-hermes/skills/creative/songwriting-and-ai-music/SKILL.md
+++ b/agents/gerhard-hermes/skills/creative/songwriting-and-ai-music/SKILL.md
@@ -1,6 +1,9 @@
 ---
 name: songwriting-and-ai-music
-description: "Songwriting craft and Suno AI music prompts."
+description: >
+  Songwriting craft, AI music generation prompts (Suno focus), parody/adaptation
+  techniques, phonetic tricks, and lessons learned. These are tools and ideas,
+  not rules. Break any of them when the art calls for it.
 tags: [songwriting, music, suno, parody, lyrics, creative]
 triggers:
   - writing a song
diff --git a/agents/gerhard-hermes/skills/data-science/jupyter-live-kernel/SKILL.md b/agents/gerhard-hermes/skills/data-science/jupyter-live-kernel/SKILL.md
index bfb4cd5..984cd9e 100644
--- a/agents/gerhard-hermes/skills/data-science/jupyter-live-kernel/SKILL.md
+++ b/agents/gerhard-hermes/skills/data-science/jupyter-live-kernel/SKILL.md
@@ -1,6 +1,11 @@
 ---
 name: jupyter-live-kernel
-description: "Iterative Python via live Jupyter kernel (hamelnb)."
+description: >
+  Use a live Jupyter kernel for stateful, iterative Python execution via hamelnb.
+  Load this skill when the task involves exploration, iteration, or inspecting
+  intermediate results — data science, ML experimentation, API exploration, or
+  building up complex code step-by-step. Uses terminal to run CLI commands against
+  a live Jupyter kernel. No new tools required.
 version: 1.0.0
 author: Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/devops/webhook-subscriptions/SKILL.md b/agents/gerhard-hermes/skills/devops/webhook-subscriptions/SKILL.md
index 6e4e896..dd20a19 100644
--- a/agents/gerhard-hermes/skills/devops/webhook-subscriptions/SKILL.md
+++ b/agents/gerhard-hermes/skills/devops/webhook-subscriptions/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: webhook-subscriptions
-description: "Webhook subscriptions: event-driven agent runs."
+description: Create and manage webhook subscriptions for event-driven agent activation, or for direct push notifications (zero LLM cost). Use when the user wants external services to trigger agent runs OR push notifications to chats.
 version: 1.1.0
 metadata:
   hermes:
diff --git a/agents/gerhard-hermes/skills/dogfood/SKILL.md b/agents/gerhard-hermes/skills/dogfood/SKILL.md
index 2757352..b7ba366 100644
--- a/agents/gerhard-hermes/skills/dogfood/SKILL.md
+++ b/agents/gerhard-hermes/skills/dogfood/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: dogfood
-description: "Exploratory QA of web apps: find bugs, evidence, reports."
+description: Systematic exploratory QA testing of web applications — find bugs, capture evidence, and generate structured reports.
 version: 1.0.0
 metadata:
   hermes:
diff --git a/agents/gerhard-hermes/skills/email/himalaya/SKILL.md b/agents/gerhard-hermes/skills/email/himalaya/SKILL.md
index b04a427..ddbf51a 100644
--- a/agents/gerhard-hermes/skills/email/himalaya/SKILL.md
+++ b/agents/gerhard-hermes/skills/email/himalaya/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: himalaya
-description: "Himalaya CLI: IMAP/SMTP email from terminal."
+description: CLI to manage emails via IMAP/SMTP. Use himalaya to list, read, write, reply, forward, search, and organize emails from the terminal. Supports multiple accounts and message composition with MML (MIME Meta Language).
 version: 1.0.0
 author: community
 license: MIT
diff --git a/agents/gerhard-hermes/skills/github/codebase-inspection/SKILL.md b/agents/gerhard-hermes/skills/github/codebase-inspection/SKILL.md
index b52b8d1..6954ad8 100644
--- a/agents/gerhard-hermes/skills/github/codebase-inspection/SKILL.md
+++ b/agents/gerhard-hermes/skills/github/codebase-inspection/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: codebase-inspection
-description: "Inspect codebases w/ pygount: LOC, languages, ratios."
+description: Inspect and analyze codebases using pygount for LOC counting, language breakdown, and code-vs-comment ratios. Use when asked to check lines of code, repo size, language composition, or codebase stats.
 version: 1.0.0
 author: Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/github/github-auth/SKILL.md b/agents/gerhard-hermes/skills/github/github-auth/SKILL.md
index b4f0dde..ea8f369 100644
--- a/agents/gerhard-hermes/skills/github/github-auth/SKILL.md
+++ b/agents/gerhard-hermes/skills/github/github-auth/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: github-auth
-description: "GitHub auth setup: HTTPS tokens, SSH keys, gh CLI login."
+description: Set up GitHub authentication for the agent using git (universally available) or the gh CLI. Covers HTTPS tokens, SSH keys, credential helpers, and gh auth — with a detection flow to pick the right method automatically.
 version: 1.1.0
 author: Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/github/github-code-review/SKILL.md b/agents/gerhard-hermes/skills/github/github-code-review/SKILL.md
index a2f1e54..8041fbb 100644
--- a/agents/gerhard-hermes/skills/github/github-code-review/SKILL.md
+++ b/agents/gerhard-hermes/skills/github/github-code-review/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: github-code-review
-description: "Review PRs: diffs, inline comments via gh or REST."
+description: Review code changes by analyzing git diffs, leaving inline comments on PRs, and performing thorough pre-push review. Works with gh CLI or falls back to git + GitHub REST API via curl.
 version: 1.1.0
 author: Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/github/github-issues/SKILL.md b/agents/gerhard-hermes/skills/github/github-issues/SKILL.md
index fe6e6e0..a3bceb8 100644
--- a/agents/gerhard-hermes/skills/github/github-issues/SKILL.md
+++ b/agents/gerhard-hermes/skills/github/github-issues/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: github-issues
-description: "Create, triage, label, assign GitHub issues via gh or REST."
+description: Create, manage, triage, and close GitHub issues. Search existing issues, add labels, assign people, and link to PRs. Works with gh CLI or falls back to git + GitHub REST API via curl.
 version: 1.1.0
 author: Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/github/github-pr-workflow/SKILL.md b/agents/gerhard-hermes/skills/github/github-pr-workflow/SKILL.md
index e3ca20f..48f15ed 100644
--- a/agents/gerhard-hermes/skills/github/github-pr-workflow/SKILL.md
+++ b/agents/gerhard-hermes/skills/github/github-pr-workflow/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: github-pr-workflow
-description: "GitHub PR lifecycle: branch, commit, open, CI, merge."
+description: Full pull request lifecycle — create branches, commit changes, open PRs, monitor CI status, auto-fix failures, and merge. Works with gh CLI or falls back to git + GitHub REST API via curl.
 version: 1.1.0
 author: Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/github/github-repo-management/SKILL.md b/agents/gerhard-hermes/skills/github/github-repo-management/SKILL.md
index 0ca8830..b3732f2 100644
--- a/agents/gerhard-hermes/skills/github/github-repo-management/SKILL.md
+++ b/agents/gerhard-hermes/skills/github/github-repo-management/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: github-repo-management
-description: "Clone/create/fork repos; manage remotes, releases."
+description: Clone, create, fork, configure, and manage GitHub repositories. Manage remotes, secrets, releases, and workflows. Works with gh CLI or falls back to git + GitHub REST API via curl.
 version: 1.1.0
 author: Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/mcp/native-mcp/SKILL.md b/agents/gerhard-hermes/skills/mcp/native-mcp/SKILL.md
index a14aa58..e56bf3f 100644
--- a/agents/gerhard-hermes/skills/mcp/native-mcp/SKILL.md
+++ b/agents/gerhard-hermes/skills/mcp/native-mcp/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: native-mcp
-description: "MCP client: connect servers, register tools (stdio/HTTP)."
+description: Built-in MCP (Model Context Protocol) client that connects to external MCP servers, discovers their tools, and registers them as native Hermes Agent tools. Supports stdio and HTTP transports with automatic reconnection, security filtering, and zero-config tool injection.
 version: 1.0.0
 author: Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/media/gif-search/SKILL.md b/agents/gerhard-hermes/skills/media/gif-search/SKILL.md
index 373f319..ee55cac 100644
--- a/agents/gerhard-hermes/skills/media/gif-search/SKILL.md
+++ b/agents/gerhard-hermes/skills/media/gif-search/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: gif-search
-description: "Search/download GIFs from Tenor via curl + jq."
+description: Search and download GIFs from Tenor using curl. No dependencies beyond curl and jq. Useful for finding reaction GIFs, creating visual content, and sending GIFs in chat.
 version: 1.1.0
 author: Hermes Agent
 license: MIT
@@ -16,10 +16,6 @@ metadata:
 
 Search and download GIFs directly via the Tenor API using curl. No extra tools needed.
 
-## When to use
-
-Useful for finding reaction GIFs, creating visual content, and sending GIFs in chat.
-
 ## Setup
 
 Set your Tenor API key in your environment (add to `~/.hermes/.env`):
diff --git a/agents/gerhard-hermes/skills/media/songsee/SKILL.md b/agents/gerhard-hermes/skills/media/songsee/SKILL.md
index 5904e41..11bcca0 100644
--- a/agents/gerhard-hermes/skills/media/songsee/SKILL.md
+++ b/agents/gerhard-hermes/skills/media/songsee/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: songsee
-description: "Audio spectrograms/features (mel, chroma, MFCC) via CLI."
+description: Generate spectrograms and audio feature visualizations (mel, chroma, MFCC, tempogram, etc.) from audio files via CLI. Useful for audio analysis, music production debugging, and visual documentation.
 version: 1.0.0
 author: community
 license: MIT
diff --git a/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness/SKILL.md b/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/SKILL.md
similarity index 100%
rename from agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness/SKILL.md
rename to agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/SKILL.md
diff --git a/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/api-evaluation.md b/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/api-evaluation.md
new file mode 100644
index 0000000..db77f61
--- /dev/null
+++ b/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/api-evaluation.md
@@ -0,0 +1,490 @@
+# API Evaluation
+
+Guide to evaluating OpenAI, Anthropic, and other API-based language models.
+
+## Overview
+
+The lm-evaluation-harness supports evaluating API-based models through a unified `TemplateAPI` interface. This allows benchmarking of:
+- OpenAI models (GPT-4, GPT-3.5, etc.)
+- Anthropic models (Claude 3, Claude 2, etc.)
+- Local OpenAI-compatible APIs
+- Custom API endpoints
+
+**Why evaluate API models**:
+- Benchmark closed-source models
+- Compare API models to open models
+- Validate API performance
+- Track model updates over time
+
+## Supported API Models
+
+| Provider | Model Type | Request Types | Logprobs |
+|----------|------------|---------------|----------|
+| OpenAI (completions) | `openai-completions` | All | ✅ Yes |
+| OpenAI (chat) | `openai-chat-completions` | `generate_until` only | ❌ No |
+| Anthropic (completions) | `anthropic-completions` | `generate_until` only | ❌ No |
+| Anthropic (chat) | `anthropic-chat` | `generate_until` only | ❌ No |
+| Local (OpenAI-compatible) | `local-completions` | Depends on server | Varies |
+
+**Note**: Models without logprobs can only be evaluated on generation tasks, not perplexity or loglikelihood tasks.
+
+## OpenAI Models
+
+### Setup
+
+```bash
+export OPENAI_API_KEY=sk-...
+```
+
+### Completion Models (Legacy)
+
+**Available models**: `davinci-002`, `babbage-002`
+
+```bash
+lm_eval --model openai-completions \
+  --model_args model=davinci-002 \
+  --tasks lambada_openai,hellaswag \
+  --batch_size auto
+```
+
+**Supports**:
+- `generate_until`: ✅
+- `loglikelihood`: ✅
+- `loglikelihood_rolling`: ✅
+
+### Chat Models
+
+**Available models**: `gpt-4`, `gpt-4-turbo`, `gpt-3.5-turbo`
+
+```bash
+lm_eval --model openai-chat-completions \
+  --model_args model=gpt-4-turbo \
+  --tasks mmlu,gsm8k,humaneval \
+  --num_fewshot 5 \
+  --batch_size auto
+```
+
+**Supports**:
+- `generate_until`: ✅
+- `loglikelihood`: ❌ (no logprobs)
+- `loglikelihood_rolling`: ❌
+
+**Important**: Chat models don't provide logprobs, so they can only be used with generation tasks (MMLU, GSM8K, HumanEval), not perplexity tasks.
+
+### Configuration Options
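+
+`--model_args` is one comma-separated string. Keep it on a single line:
+whitespace (including indentation after a `\` continuation) would split
+it into separate shell arguments.
+
+```bash
+lm_eval \
+  --model openai-chat-completions \
+  --model_args \
+    model=gpt-4-turbo,base_url=https://api.openai.com/v1,num_concurrent=5,max_retries=3,timeout=60,batch_size=auto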
+```
+
+**Parameters**:
+- `model`: Model identifier (required)
+- `base_url`: API endpoint (default: OpenAI)
+- `num_concurrent`: Concurrent requests (default: 5)
+- `max_retries`: Retry failed requests (default: 3)
+- `timeout`: Request timeout in seconds (default: 60)
+- `tokenizer`: Tokenizer to use (default: matches model)
+- `tokenizer_backend`: `"tiktoken"` or `"huggingface"`
+
+### Cost Management
+
+OpenAI charges per token. Estimate costs before running:
+
+```python
+# Rough estimate
+num_samples = 1000
+avg_tokens_per_sample = 500  # input + output
+cost_per_1k_tokens = 0.01  # example rate; look up current pricing for your model
+
+total_cost = (num_samples * avg_tokens_per_sample / 1000) * cost_per_1k_tokens
+print(f"Estimated cost: ${total_cost:.2f}")
+```
+
+**Cost-saving tips**:
+- Use `--limit N` for testing
+- Start with `gpt-3.5-turbo` before `gpt-4`
+- Set `max_gen_toks` to minimum needed
+- Use `num_fewshot=0` for zero-shot when possible
+
+## Anthropic Models
+
+### Setup
+
+```bash
+export ANTHROPIC_API_KEY=sk-ant-...
+```
+
+### Completion Models (Legacy)
+
+```bash
+lm_eval --model anthropic-completions \
+  --model_args model=claude-2.1 \
+  --tasks lambada_openai,hellaswag \
+  --batch_size auto
+```
+
+### Chat Models (Recommended)
+
+**Available models**: `claude-3-5-sonnet-20241022`, `claude-3-opus-20240229`, `claude-3-sonnet-20240229`, `claude-3-haiku-20240307`
+
+```bash
+lm_eval --model anthropic-chat \
+  --model_args model=claude-3-5-sonnet-20241022 \
+  --tasks mmlu,gsm8k,humaneval \
+  --num_fewshot 5 \
+  --batch_size auto
+```
+
+**Aliases**: `anthropic-chat-completions` (same as `anthropic-chat`)
+
+### Configuration Options
+
+```bash
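+# --model_args is a single comma-separated word. Do not break it across
+# indented continuation lines: the indentation becomes whitespace, which
+# splits the value into separate shell arguments.
+lm_eval \
+  --model anthropic-chat \
+  --model_args \
+    model=claude-3-5-sonnet-20241022,base_url=https://api.anthropic.com,num_concurrent=5,max_retries=3,timeout=60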
+```
+
+### Cost Management
+
+Anthropic pricing (as of 2024):
+- Claude 3.5 Sonnet: $3.00 / 1M input, $15.00 / 1M output
+- Claude 3 Opus: $15.00 / 1M input, $75.00 / 1M output
+- Claude 3 Haiku: $0.25 / 1M input, $1.25 / 1M output
+
+**Budget-friendly strategy**:
+```bash
+# Test on small sample first
+lm_eval --model anthropic-chat \
+  --model_args model=claude-3-haiku-20240307 \
+  --tasks mmlu \
+  --limit 100
+
+# Then run full eval on best model
+lm_eval --model anthropic-chat \
+  --model_args model=claude-3-5-sonnet-20241022 \
+  --tasks mmlu \
+  --num_fewshot 5
+```
+
+## Local OpenAI-Compatible APIs
+
+Many local inference servers expose OpenAI-compatible APIs (vLLM, Text Generation Inference, llama.cpp, Ollama).
+
+### vLLM Local Server
+
+**Start server**:
+```bash
+vllm serve meta-llama/Llama-2-7b-hf \
+  --host 0.0.0.0 \
+  --port 8000
+```
+
+**Evaluate**:
+```bash
+# Keep --model_args on one line: whitespace inside it splits the value.
+lm_eval \
+  --model local-completions \
+  --model_args \
+    model=meta-llama/Llama-2-7b-hf,base_url=http://localhost:8000/v1,num_concurrent=1 \
+  --tasks mmlu,gsm8k \
+  --batch_size auto
+```
+
+### Text Generation Inference (TGI)
+
+**Start server**:
+```bash
+docker run --gpus all --shm-size 1g -p 8080:80 \
+  ghcr.io/huggingface/text-generation-inference:latest \
+  --model-id meta-llama/Llama-2-7b-hf
+```
+
+**Evaluate**:
+```bash
+lm_eval \
+  --model local-completions \
+  --model_args \
+    model=meta-llama/Llama-2-7b-hf,base_url=http://localhost:8080/v1 \
+  --tasks hellaswag,arc_challenge
+```
+
+### Ollama
+
+**Start server**:
+```bash
+ollama serve
+ollama pull llama2:7b
+```
+
+**Evaluate**:
+```bash
+lm_eval \
+  --model local-completions \
+  --model_args \
+    model=llama2:7b,base_url=http://localhost:11434/v1 \
+  --tasks mmlu
+```
+
+### llama.cpp Server
+
+**Start server**:
+```bash
+./server -m models/llama-2-7b.gguf --host 0.0.0.0 --port 8080
+```
+
+**Evaluate**:
+```bash
+lm_eval \
+  --model local-completions \
+  --model_args \
+    model=llama2,base_url=http://localhost:8080/v1 \
+  --tasks gsm8k
+```
+
+## Custom API Implementation
+
+For custom API endpoints, subclass `TemplateAPI`:
+
+### Create `my_api.py`
+
+```python
+from lm_eval.models.api_models import TemplateAPI
+import requests
+
+class MyCustomAPI(TemplateAPI):
+    """Custom API model."""
+
+    def __init__(self, base_url, api_key, **kwargs):
+        super().__init__(base_url=base_url, **kwargs)
+        self.api_key = api_key
+
+    def _create_payload(self, messages, gen_kwargs):
+        """Create API request payload."""
+        return {
+            "messages": messages,
+            "api_key": self.api_key,
+            **gen_kwargs
+        }
+
+    def parse_generations(self, response):
+        """Parse generation response."""
+        return response.json()["choices"][0]["text"]
+
+    def parse_logprobs(self, response):
+        """Parse logprobs (if available)."""
+        # Return None if API doesn't provide logprobs
+        logprobs = response.json().get("logprobs")
+        if logprobs:
+            return logprobs["token_logprobs"]
+        return None
+```
+
+### Register and Use
+
+```python
+from lm_eval import evaluator
+from my_api import MyCustomAPI
+
+model = MyCustomAPI(
+    base_url="https://api.example.com/v1",
+    api_key="your-key"
+)
+
+results = evaluator.simple_evaluate(
+    model=model,
+    tasks=["mmlu", "gsm8k"],
+    num_fewshot=5,
+    batch_size="auto"
+)
+```
+
+## Comparing API and Open Models
+
+### Side-by-Side Evaluation
+
+```bash
+# Evaluate OpenAI GPT-4
+lm_eval --model openai-chat-completions \
+  --model_args model=gpt-4-turbo \
+  --tasks mmlu,gsm8k,hellaswag \
+  --num_fewshot 5 \
+  --output_path results/gpt4.json
+
+# Evaluate open Llama 2 70B
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-70b-hf,dtype=bfloat16 \
+  --tasks mmlu,gsm8k,hellaswag \
+  --num_fewshot 5 \
+  --output_path results/llama2-70b.json
+
+# Compare results
+python scripts/compare_results.py \
+  results/gpt4.json \
+  results/llama2-70b.json
+```
+
+### Typical Comparisons
+
+| Model | MMLU | GSM8K | HumanEval | Cost |
+|-------|------|-------|-----------|------|
+| GPT-4 Turbo | 86.4% | 92.0% | 67.0% | $$$$ |
+| Claude 3 Opus | 86.8% | 95.0% | 84.9% | $$$$ |
+| GPT-3.5 Turbo | 70.0% | 57.1% | 48.1% | $$ |
+| Llama 2 70B | 68.9% | 56.8% | 29.9% | Free (self-host) |
+| Mixtral 8x7B | 70.6% | 58.4% | 40.2% | Free (self-host) |
+
+## Best Practices
+
+### Rate Limiting
+
+Respect API rate limits by lowering concurrency and lengthening the timeout:
+```bash
+# Keep --model_args on one line: whitespace or comments inside it split the value.
+lm_eval \
+  --model openai-chat-completions \
+  --model_args \
+    model=gpt-4-turbo,num_concurrent=3,timeout=120 \
+  --tasks mmlu
+```
+
+### Reproducibility
+
+Set temperature to 0 for deterministic results:
+```bash
+lm_eval --model openai-chat-completions \
+  --model_args model=gpt-4-turbo \
+  --tasks mmlu \
+  --gen_kwargs temperature=0.0
+```
+
+Or use `seed` for sampling:
+```bash
+lm_eval --model anthropic-chat \
+  --model_args model=claude-3-5-sonnet-20241022 \
+  --tasks gsm8k \
+  --gen_kwargs temperature=0.7,seed=42
+```
+
+### Caching
+
+Pass `--use_cache <path>` to cache model responses and avoid redundant API calls:
+```bash
+# First run: makes API calls
+lm_eval --model openai-chat-completions \
+  --model_args model=gpt-4-turbo \
+  --tasks mmlu --limit 100 \
+  --use_cache cache/gpt-4-turbo
+
+# Second run: uses cache (instant, free)
+lm_eval --model openai-chat-completions \
+  --model_args model=gpt-4-turbo \
+  --tasks mmlu --limit 100 \
+  --use_cache cache/gpt-4-turbo
+```
+
+Cache location: the path passed to `--use_cache` (reuse the same path to hit the cache).
+
+### Error Handling
+
+APIs can fail. Use retries:
+```bash
+# Keep --model_args on one line: whitespace inside it splits the value.
+lm_eval \
+  --model openai-chat-completions \
+  --model_args \
+    model=gpt-4-turbo,max_retries=5,timeout=120 \
+  --tasks mmlu
+```
+
+## Troubleshooting
+
+### "Authentication failed"
+
+Check API key:
+```bash
+echo $OPENAI_API_KEY  # Should print sk-...
+echo $ANTHROPIC_API_KEY  # Should print sk-ant-...
+```
+
+### "Rate limit exceeded"
+
+Reduce concurrency:
+```bash
+--model_args num_concurrent=1
+```
+
+Or add delays between requests.
+
+### "Timeout error"
+
+Increase timeout:
+```bash
+--model_args timeout=180
+```
+
+### "Model not found"
+
+For local APIs, verify server is running:
+```bash
+curl http://localhost:8000/v1/models
+```
+
+### Cost Runaway
+
+Use `--limit` for testing:
+```bash
+lm_eval --model openai-chat-completions \
+  --model_args model=gpt-4-turbo \
+  --tasks mmlu \
+  --limit 50  # Only 50 samples
+```
+
+## Advanced Features
+
+### Custom Headers
+
+```bash
+lm_eval \
+  --model local-completions \
+  --model_args \
+    base_url=http://api.example.com/v1,header="Authorization: Bearer token,X-Custom: value"
+```
+
+### Disable SSL Verification (Development Only)
+
+```bash
+lm_eval \
+  --model local-completions \
+  --model_args \
+    base_url=https://localhost:8000/v1,verify_certificate=false
+```
+
+### Custom Tokenizer
+
+```bash
+# Keep --model_args on one line: whitespace inside it splits the value.
+lm_eval \
+  --model openai-chat-completions \
+  --model_args \
+    model=gpt-4-turbo,tokenizer=gpt2,tokenizer_backend=huggingface
+```
+
+## References
+
+- OpenAI API: https://platform.openai.com/docs/api-reference
+- Anthropic API: https://docs.anthropic.com/claude/reference
+- TemplateAPI: `lm_eval/models/api_models.py`
+- OpenAI models: `lm_eval/models/openai_completions.py`
+- Anthropic models: `lm_eval/models/anthropic_llms.py`
diff --git a/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/benchmark-guide.md b/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/benchmark-guide.md
new file mode 100644
index 0000000..e3031ec
--- /dev/null
+++ b/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/benchmark-guide.md
@@ -0,0 +1,488 @@
+# Benchmark Guide
+
+Complete guide to all 60+ evaluation tasks in lm-evaluation-harness, what they measure, and how to interpret results.
+
+## Overview
+
+The lm-evaluation-harness includes 60+ benchmarks spanning:
+- Language understanding (MMLU, GLUE)
+- Mathematical reasoning (GSM8K, MATH)
+- Code generation (HumanEval, MBPP)
+- Instruction following (IFEval, AlpacaEval)
+- Long-context understanding (LongBench)
+- Multilingual capabilities (AfroBench, NorEval)
+- Reasoning (BBH, ARC)
+- Truthfulness (TruthfulQA)
+
+**List all tasks**:
+```bash
+lm_eval --tasks list
+```
+
+## Major Benchmarks
+
+### MMLU (Massive Multitask Language Understanding)
+
+**What it measures**: Broad knowledge across 57 subjects (STEM, humanities, social sciences, law).
+
+**Task variants**:
+- `mmlu`: Original 57-subject benchmark
+- `mmlu_pro`: More challenging version with reasoning-focused questions
+- `mmlu_prox`: Multilingual extension
+
+**Format**: Multiple choice (4 options)
+
+**Example**:
+```
+Question: What is the capital of France?
+A. Berlin
+B. Paris
+C. London
+D. Madrid
+Answer: B
+```
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu \
+  --num_fewshot 5
+```
+
+**Interpretation**:
+- Random: 25% (chance)
+- GPT-3 (175B): 43.9%
+- GPT-4: 86.4%
+- Human expert: ~90%
+
+**Good for**: Assessing general knowledge and domain expertise.
+
+### GSM8K (Grade School Math 8K)
+
+**What it measures**: Mathematical reasoning on grade-school level word problems.
+
+**Task variants**:
+- `gsm8k`: Base task
+- `gsm8k_cot`: With chain-of-thought prompting
+- `gsm_plus`: Adversarial variant with perturbations
+
+**Format**: Free-form generation, extract numerical answer
+
+**Example**:
+```
+Question: A baker made 200 cookies. He sold 3/5 of them in the morning and 1/4 of the remaining in the afternoon. How many cookies does he have left?
+Answer: 60
+```
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks gsm8k \
+  --num_fewshot 5
+```
+
+**Interpretation**:
+- Random: ~0%
+- GPT-3 (175B): 17.0%
+- GPT-4: 92.0%
+- Llama 2 70B: 56.8%
+
+**Good for**: Testing multi-step reasoning and arithmetic.
+
+### HumanEval
+
+**What it measures**: Python code generation from docstrings (functional correctness).
+
+**Task variants**:
+- `humaneval`: Standard benchmark
+- `humaneval_instruct`: For instruction-tuned models
+
+**Format**: Code generation, execution-based evaluation
+
+**Example**:
+```python
+def has_close_elements(numbers: List[float], threshold: float) -> bool:
+    """ Check if in given list of numbers, are any two numbers closer to each other than
+    given threshold.
+    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
+    False
+    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
+    True
+    """
+```
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=codellama/CodeLlama-7b-hf \
+  --tasks humaneval \
+  --batch_size 1
+```
+
+**Interpretation**:
+- Random: 0%
+- GPT-3 (175B): 0%
+- Codex: 28.8%
+- GPT-4: 67.0%
+- Code Llama 34B: 53.7%
+
+**Good for**: Evaluating code generation capabilities.
+
+### BBH (BIG-Bench Hard)
+
+**What it measures**: 23 challenging reasoning tasks where models previously failed to beat humans.
+
+**Categories**:
+- Logical reasoning
+- Math word problems
+- Social understanding
+- Algorithmic reasoning
+
+**Format**: Multiple choice and free-form
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks bbh \
+  --num_fewshot 3
+```
+
+**Interpretation**:
+- Random: ~25%
+- GPT-3 (175B): 33.9%
+- PaLM 540B: 58.3%
+- GPT-4: 86.7%
+
+**Good for**: Testing advanced reasoning capabilities.
+
+### IFEval (Instruction-Following Evaluation)
+
+**What it measures**: Ability to follow specific, verifiable instructions.
+
+**Instruction types**:
+- Format constraints (e.g., "answer in 3 sentences")
+- Length constraints (e.g., "use at least 100 words")
+- Content constraints (e.g., "include the word 'banana'")
+- Structural constraints (e.g., "use bullet points")
+
+**Format**: Free-form generation with rule-based verification
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-chat-hf \
+  --tasks ifeval \
+  --batch_size auto
+```
+
+**Interpretation**:
+- Measures: Instruction adherence (not quality)
+- GPT-4: 86% instruction following
+- Claude 2: 84%
+
+**Good for**: Evaluating chat/instruct models.
+
+### GLUE (General Language Understanding Evaluation)
+
+**What it measures**: Natural language understanding across 9 tasks.
+
+**Tasks**:
+- `cola`: Grammatical acceptability
+- `sst2`: Sentiment analysis
+- `mrpc`: Paraphrase detection
+- `qqp`: Question pairs
+- `stsb`: Semantic similarity
+- `mnli`: Natural language inference
+- `qnli`: Question answering NLI
+- `rte`: Recognizing textual entailment
+- `wnli`: Winograd schemas
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=bert-base-uncased \
+  --tasks glue \
+  --num_fewshot 0
+```
+
+**Interpretation**:
+- BERT Base: 78.3 (GLUE score)
+- RoBERTa Large: 88.5
+- Human baseline: 87.1
+
+**Good for**: Encoder-only models, fine-tuning baselines.
+
+### LongBench
+
+**What it measures**: Long-context understanding (4K-32K tokens).
+
+**21 tasks covering**:
+- Single-document QA
+- Multi-document QA
+- Summarization
+- Few-shot learning
+- Code completion
+- Synthetic tasks
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks longbench \
+  --batch_size 1
+```
+
+**Interpretation**:
+- Tests context utilization
+- Many models struggle beyond 4K tokens
+- GPT-4 Turbo: 54.3%
+
+**Good for**: Evaluating long-context models.
+
+## Additional Benchmarks
+
+### TruthfulQA
+
+**What it measures**: Model's propensity to be truthful vs. generate plausible-sounding falsehoods.
+
+**Format**: Multiple choice with 4-5 options
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks truthfulqa_mc2 \
+  --batch_size auto
+```
+
+**Interpretation**:
+- Larger models often score worse (more convincing lies)
+- GPT-3: 58.8%
+- GPT-4: 59.0%
+- Human: ~94%
+
+### ARC (AI2 Reasoning Challenge)
+
+**What it measures**: Grade-school science questions.
+
+**Variants**:
+- `arc_easy`: Easier questions
+- `arc_challenge`: Harder questions requiring reasoning
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks arc_challenge \
+  --num_fewshot 25
+```
+
+**Interpretation**:
+- ARC-Easy: Most models >80%
+- ARC-Challenge random: 25%
+- GPT-4: 96.3%
+
+### HellaSwag
+
+**What it measures**: Commonsense reasoning about everyday situations.
+
+**Format**: Choose most plausible continuation
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks hellaswag \
+  --num_fewshot 10
+```
+
+**Interpretation**:
+- Random: 25%
+- GPT-3: 78.9%
+- Llama 2 70B: 85.3%
+
+### WinoGrande
+
+**What it measures**: Commonsense reasoning via pronoun resolution.
+
+**Example**:
+```
+The trophy doesn't fit in the brown suitcase because _ is too large.
+A. the trophy
+B. the suitcase
+```
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks winogrande \
+  --num_fewshot 5
+```
+
+### PIQA
+
+**What it measures**: Physical commonsense reasoning.
+
+**Example**: "To clean a keyboard, use compressed air or..."
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks piqa
+```
+
+## Multilingual Benchmarks
+
+### AfroBench
+
+**What it measures**: Performance across 64 African languages.
+
+**15 tasks**: NLU, text generation, knowledge, QA, math reasoning
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks afrobench
+```
+
+### NorEval
+
+**What it measures**: Norwegian language understanding (9 task categories).
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=NbAiLab/nb-gpt-j-6B \
+  --tasks noreval
+```
+
+## Domain-Specific Benchmarks
+
+### MATH
+
+**What it measures**: High-school competition math problems.
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks math \
+  --num_fewshot 4
+```
+
+**Interpretation**:
+- Very challenging
+- GPT-4: 42.5%
+- Minerva 540B: 33.6%
+
+### MBPP (Mostly Basic Python Problems)
+
+**What it measures**: Python programming from natural language descriptions.
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=codellama/CodeLlama-7b-hf \
+  --tasks mbpp \
+  --batch_size 1
+```
+
+### DROP
+
+**What it measures**: Reading comprehension requiring discrete reasoning.
+
+**Command**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks drop
+```
+
+## Benchmark Selection Guide
+
+### For General Purpose Models
+
+Run this suite:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu,gsm8k,hellaswag,arc_challenge,truthfulqa_mc2 \
+  --num_fewshot 5
+```
+
+### For Code Models
+
+```bash
+lm_eval --model hf \
+  --model_args pretrained=codellama/CodeLlama-7b-hf \
+  --tasks humaneval,mbpp \
+  --batch_size 1
+```
+
+### For Chat/Instruct Models
+
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-chat-hf \
+  --tasks ifeval,mmlu,gsm8k_cot \
+  --batch_size auto
+```
+
+### For Long Context Models
+
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-3.1-8B \
+  --tasks longbench \
+  --batch_size 1
+```
+
+## Interpreting Results
+
+### Understanding Metrics
+
+**Accuracy**: Percentage of correct answers (most common)
+
+**Exact Match (EM)**: Requires exact string match (strict)
+
+**F1 Score**: Balances precision and recall
+
+**BLEU/ROUGE**: Text generation similarity
+
+**Pass@k**: Percentage passing when generating k samples
+
+### Typical Score Ranges
+
+| Model Size | MMLU | GSM8K | HumanEval | HellaSwag |
+|------------|------|-------|-----------|-----------|
+| 7B | 40-50% | 10-20% | 5-15% | 70-80% |
+| 13B | 45-55% | 20-35% | 15-25% | 75-82% |
+| 70B | 60-70% | 50-65% | 35-50% | 82-87% |
+| GPT-4 | 86% | 92% | 67% | 95% |
+
+### Red Flags
+
+- **All tasks at random chance**: Model not trained properly
+- **Exactly 0% on generation tasks**: Likely format/parsing issue
+- **Huge variance across runs**: Check seed/sampling settings
+- **Better than GPT-4 on everything**: Likely contamination
+
+## Best Practices
+
+1. **Always report few-shot setting**: 0-shot, 5-shot, etc.
+2. **Run multiple seeds**: Report mean ± std
+3. **Check for data contamination**: Search training data for benchmark examples
+4. **Compare to published baselines**: Validate your setup
+5. **Report all hyperparameters**: Model, batch size, max tokens, temperature
+
+## References
+
+- Task list: `lm_eval --tasks list`
+- Task README: `lm_eval/tasks/README.md`
+- Papers: See individual benchmark papers
diff --git a/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/custom-tasks.md b/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/custom-tasks.md
new file mode 100644
index 0000000..c5c1e89
--- /dev/null
+++ b/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/custom-tasks.md
@@ -0,0 +1,602 @@
+# Custom Tasks
+
+Complete guide to creating domain-specific evaluation tasks in lm-evaluation-harness.
+
+## Overview
+
+Custom tasks allow you to evaluate models on your own datasets and metrics. Tasks are defined using YAML configuration files with optional Python utilities for complex logic.
+
+**Why create custom tasks**:
+- Evaluate on proprietary/domain-specific data
+- Test specific capabilities not covered by existing benchmarks
+- Create evaluation pipelines for internal models
+- Reproduce research experiments
+
+## Quick Start
+
+### Minimal Custom Task
+
+Create `my_tasks/simple_qa.yaml`:
+
+```yaml
+task: simple_qa
+dataset_path: data/simple_qa.jsonl
+output_type: generate_until
+doc_to_text: "Question: {{question}}\nAnswer:"
+doc_to_target: "{{answer}}"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+```
+
+**Run it**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks simple_qa \
+  --include_path my_tasks/
+```
+
+## Task Configuration Reference
+
+### Essential Fields
+
+```yaml
+# Task identification
+task: my_custom_task           # Unique task name (required)
+task_alias: "My Task"          # Display name
+tag:                           # Tags for grouping
+  - custom
+  - domain_specific
+
+# Dataset configuration
+dataset_path: data/my_data.jsonl  # HuggingFace dataset or local path
+dataset_name: default             # Subset name (if applicable)
+training_split: train
+validation_split: validation
+test_split: test
+
+# Evaluation configuration
+output_type: generate_until    # or loglikelihood, loglikelihood_rolling, multiple_choice
+num_fewshot: 5                 # Number of few-shot examples
+batch_size: auto               # Batch size
+
+# Prompt templates (Jinja2)
+doc_to_text: "Question: {{question}}"
+doc_to_target: "{{answer}}"
+
+# Metrics
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+
+# Metadata
+metadata:
+  version: 1.0
+```
+
+### Output Types
+
+**`generate_until`**: Free-form generation
+```yaml
+output_type: generate_until
+generation_kwargs:
+  max_gen_toks: 256
+  until:
+    - "\n"
+    - "."
+  temperature: 0.0
+```
+
+**`loglikelihood`**: Compute log probability of targets
+```yaml
+output_type: loglikelihood
+# Used for perplexity, classification
+```
+
+**`multiple_choice`**: Choose from options
+```yaml
+output_type: multiple_choice
+doc_to_choice: "{{choices}}"  # List of choices
+```
+
+## Data Formats
+
+### Local JSONL File
+
+`data/my_data.jsonl`:
+```json
+{"question": "What is 2+2?", "answer": "4"}
+{"question": "Capital of France?", "answer": "Paris"}
+```
+
+**Task config**:
+```yaml
+dataset_path: json  # Loader type; the file itself is listed under data_files
+dataset_kwargs:
+  data_files:
+    test: data/my_data.jsonl
+```
+
+### HuggingFace Dataset
+
+```yaml
+dataset_path: squad
+dataset_name: plain_text
+test_split: validation
+```
+
+### CSV File
+
+`data/my_data.csv`:
+```csv
+question,answer,category
+What is 2+2?,4,math
+Capital of France?,Paris,geography
+```
+
+**Task config**:
+```yaml
+dataset_path: csv  # Loader type; the file itself is listed under data_files
+dataset_kwargs:
+  data_files:
+    test: data/my_data.csv
+```
+
+## Prompt Engineering
+
+### Simple Template
+
+```yaml
+doc_to_text: "Question: {{question}}\nAnswer:"
+doc_to_target: "{{answer}}"
+```
+
+### Conditional Logic
+
+```yaml
+doc_to_text: |
+  {% if context %}
+  Context: {{context}}
+  {% endif %}
+  Question: {{question}}
+  Answer:
+```
+
+### Multiple Choice
+
+```yaml
+doc_to_text: |
+  Question: {{question}}
+  A. {{choices[0]}}
+  B. {{choices[1]}}
+  C. {{choices[2]}}
+  D. {{choices[3]}}
+  Answer:
+
+doc_to_target: "{{ 'ABCD'[answer_idx] }}"
+doc_to_choice: ["A", "B", "C", "D"]
+```
+
+### Few-Shot Formatting
+
+```yaml
+fewshot_delimiter: "\n\n"        # Between examples
+target_delimiter: " "            # Between question and answer
+doc_to_text: "Q: {{question}}"
+doc_to_target: "A: {{answer}}"
+```
+
+## Custom Python Functions
+
+For complex logic, use Python functions in `utils.py`.
+
+### Create `my_tasks/utils.py`
+
+```python
+def process_docs(dataset):
+    """Preprocess documents."""
+    def _process(doc):
+        # Custom preprocessing
+        doc["question"] = doc["question"].strip().lower()
+        return doc
+
+    return dataset.map(_process)
+
+def doc_to_text(doc):
+    """Custom prompt formatting."""
+    context = doc.get("context", "")
+    question = doc["question"]
+
+    if context:
+        return f"Context: {context}\nQuestion: {question}\nAnswer:"
+    return f"Question: {question}\nAnswer:"
+
+def doc_to_target(doc):
+    """Custom target extraction."""
+    return doc["answer"].strip().lower()
+
+def aggregate_scores(items):
+    """Custom metric aggregation."""
+    correct = sum(1 for item in items if item == 1.0)
+    total = len(items)
+    return correct / total if total > 0 else 0.0
+```
+
+### Use in Task Config
+
+```yaml
+task: my_custom_task
+dataset_path: data/my_data.jsonl
+
+# Use Python functions
+process_docs: !function utils.process_docs
+doc_to_text: !function utils.doc_to_text
+doc_to_target: !function utils.doc_to_target
+
+metric_list:
+  - metric: exact_match
+    aggregation: !function utils.aggregate_scores
+    higher_is_better: true
+```
+
+## Real-World Examples
+
+### Example 1: Domain QA Task
+
+**Goal**: Evaluate medical question answering.
+
+`medical_qa/medical_qa.yaml`:
+```yaml
+task: medical_qa
+dataset_path: data/medical_qa.jsonl
+output_type: generate_until
+num_fewshot: 3
+
+doc_to_text: |
+  Medical Question: {{question}}
+  Context: {{context}}
+  Answer (be concise):
+
+doc_to_target: "{{answer}}"
+
+generation_kwargs:
+  max_gen_toks: 100
+  until:
+    - "\n\n"
+  temperature: 0.0
+
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+  - metric: !function utils.medical_f1
+    aggregation: mean
+    higher_is_better: true
+
+filter_list:
+  - name: lowercase
+    filter:
+      - function: lowercase
+      - function: remove_whitespace
+
+metadata:
+  version: 1.0
+  domain: medical
+```
+
+`medical_qa/utils.py`:
+```python
+from sklearn.metrics import f1_score
+import re
+
+def medical_f1(predictions, references):
+    """Custom F1 for medical terms."""
+    pred_terms = set(extract_medical_terms(predictions[0]))
+    ref_terms = set(extract_medical_terms(references[0]))
+
+    if not pred_terms and not ref_terms:
+        return 1.0
+    if not pred_terms or not ref_terms:
+        return 0.0
+
+    tp = len(pred_terms & ref_terms)
+    fp = len(pred_terms - ref_terms)
+    fn = len(ref_terms - pred_terms)
+
+    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
+    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
+
+    return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+
+def extract_medical_terms(text):
+    """Extract medical terminology."""
+    # Custom logic
+    return re.findall(r'\b[A-Z][a-z]+(?:[A-Z][a-z]+)*\b', text)
+```
+
+### Example 2: Code Evaluation
+
+`code_eval/python_challenges.yaml`:
+```yaml
+task: python_challenges
+dataset_path: data/python_problems.jsonl
+output_type: generate_until
+num_fewshot: 0
+
+doc_to_text: |
+  Write a Python function to solve:
+  {{problem_statement}}
+
+  Function signature:
+  {{function_signature}}
+
+doc_to_target: "{{canonical_solution}}"
+
+generation_kwargs:
+  max_gen_toks: 512
+  until:
+    - "\n\nclass"
+    - "\n\ndef"
+  temperature: 0.2
+
+metric_list:
+  - metric: !function utils.execute_code
+    aggregation: mean
+    higher_is_better: true
+
+process_results: !function utils.process_code_results
+
+metadata:
+  version: 1.0
+```
+
+`code_eval/utils.py`:
+```python
+import subprocess
+import json
+
+def execute_code(predictions, references):
+    """Execute generated code against test cases."""
+    generated_code = predictions[0]
+    test_cases = json.loads(references[0])
+
+    try:
+        # Execute code with test cases
+        for test_input, expected_output in test_cases:
+            result = execute_with_timeout(generated_code, test_input, timeout=5)
+            if result != expected_output:
+                return 0.0
+        return 1.0
+    except Exception:
+        return 0.0
+
+def execute_with_timeout(code, input_data, timeout=5):
+    """Safely execute code with timeout."""
+    # Implementation with subprocess and timeout
+    pass
+
+def process_code_results(doc, results):
+    """Process code execution results."""
+    return {
+        "passed": results[0] == 1.0,
+        "generated_code": results[1]
+    }
+```
+
+### Example 3: Instruction Following
+
+`instruction_eval/instruction_eval.yaml`:
+```yaml
+task: instruction_following
+dataset_path: data/instructions.jsonl
+output_type: generate_until
+num_fewshot: 0
+
+doc_to_text: |
+  Instruction: {{instruction}}
+  {% if constraints %}
+  Constraints: {{constraints}}
+  {% endif %}
+  Response:
+
+doc_to_target: "{{expected_response}}"
+
+generation_kwargs:
+  max_gen_toks: 256
+  temperature: 0.7
+
+metric_list:
+  - metric: !function utils.check_constraints
+    aggregation: mean
+    higher_is_better: true
+  - metric: !function utils.semantic_similarity
+    aggregation: mean
+    higher_is_better: true
+
+process_docs: !function utils.add_constraint_checkers
+```
+
+`instruction_eval/utils.py`:
+```python
+import json
+from sentence_transformers import SentenceTransformer, util
+model = SentenceTransformer('all-MiniLM-L6-v2')
+
+def check_constraints(predictions, references):
+    """Check if response satisfies constraints."""
+    response = predictions[0]
+    constraints = json.loads(references[0])
+
+    satisfied = 0
+    total = len(constraints)
+
+    for constraint in constraints:
+        if verify_constraint(response, constraint):
+            satisfied += 1
+
+    return satisfied / total if total > 0 else 1.0
+
+def verify_constraint(response, constraint):
+    """Verify single constraint."""
+    if constraint["type"] == "length":
+        return len(response.split()) >= constraint["min_words"]
+    elif constraint["type"] == "contains":
+        return constraint["keyword"] in response.lower()
+    # Add more constraint types
+    return True
+
+def semantic_similarity(predictions, references):
+    """Compute semantic similarity."""
+    pred_embedding = model.encode(predictions[0])
+    ref_embedding = model.encode(references[0])
+    return float(util.cos_sim(pred_embedding, ref_embedding))
+
+def add_constraint_checkers(dataset):
+    """Parse constraints into verifiable format."""
+    def _parse(doc):
+        # Parse constraint string into structured format
+        doc["parsed_constraints"] = parse_constraints(doc.get("constraints", ""))
+        return doc
+    return dataset.map(_parse)
+```
+
+## Advanced Features
+
+### Output Filtering
+
+```yaml
+filter_list:
+  - name: extract_answer
+    filter:
+      - function: regex
+        regex_pattern: "Answer: (.*)"
+        group_select: 0  # Index of the match to keep (0 = first match)
+      - function: lowercase
+      - function: remove_whitespace
+```
+
+### Multiple Metrics
+
+```yaml
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+  - metric: f1
+    aggregation: mean
+    higher_is_better: true
+  - metric: bleu
+    aggregation: mean
+    higher_is_better: true
+```
+
+### Task Groups
+
+Create `my_tasks/_default.yaml`:
+```yaml
+group: my_eval_suite
+task:
+  - simple_qa
+  - medical_qa
+  - python_challenges
+```
+
+**Run entire suite**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks my_eval_suite \
+  --include_path my_tasks/
+```
+
+## Testing Your Task
+
+### Validate Configuration
+
+```bash
+# Test task loading
+lm_eval --tasks my_custom_task --include_path my_tasks/ --limit 0
+
+# Run on 5 samples
+lm_eval --model hf \
+  --model_args pretrained=gpt2 \
+  --tasks my_custom_task \
+  --include_path my_tasks/ \
+  --limit 5
+```
+
+### Debug Mode
+
+```bash
+lm_eval --model hf \
+  --model_args pretrained=gpt2 \
+  --tasks my_custom_task \
+  --include_path my_tasks/ \
+  --limit 1 \
+  --output_path results/ --log_samples  # Save input/output samples (requires --output_path)
+```
+
+## Best Practices
+
+1. **Start simple**: Test with minimal config first
+2. **Version your tasks**: Use `metadata.version`
+3. **Document your metrics**: Explain custom metrics in comments
+4. **Test with multiple models**: Ensure robustness
+5. **Validate on known examples**: Include sanity checks
+6. **Use filters carefully**: Can hide errors
+7. **Handle edge cases**: Empty strings, missing fields
+
+## Common Patterns
+
+### Classification Task
+
+```yaml
+output_type: loglikelihood
+doc_to_text: "Text: {{text}}\nLabel:"
+doc_to_target: " {{label}}"  # Space prefix important!
+metric_list:
+  - metric: acc
+    aggregation: mean
+```
+
+### Perplexity Evaluation
+
+```yaml
+output_type: loglikelihood_rolling
+doc_to_text: "{{text}}"
+metric_list:
+  - metric: perplexity
+    aggregation: perplexity
+```
+
+### Ranking Task
+
+```yaml
+output_type: loglikelihood
+doc_to_text: "Query: {{query}}\nPassage: {{passage}}\nRelevant:"
+doc_to_target: [" Yes", " No"]
+metric_list:
+  - metric: acc
+    aggregation: mean
+```
+
+## Troubleshooting
+
+**"Task not found"**: Check `--include_path` and task name
+
+**Empty results**: Verify `doc_to_text` and `doc_to_target` templates
+
+**Metric errors**: Ensure metric names are correct (exact_match, not exact-match)
+
+**Filter issues**: Test filters with `--log_samples`
+
+**Python function not found**: Check `!function module.function_name` syntax
+
+## References
+
+- Task system: EleutherAI/lm-evaluation-harness docs
+- Example tasks: `lm_eval/tasks/` directory
+- TaskConfig: `lm_eval/api/task.py`
diff --git a/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/distributed-eval.md b/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/distributed-eval.md
new file mode 100644
index 0000000..2132e5b
--- /dev/null
+++ b/agents/gerhard-hermes/skills/mlops/evaluation/lm-evaluation-harness.bak/references/distributed-eval.md
@@ -0,0 +1,519 @@
+# Distributed Evaluation
+
+Guide to running evaluation across multiple GPUs using data parallelism and tensor/pipeline parallelism.
+
+## Overview
+
+Distributed evaluation speeds up benchmarking by:
+- **Data Parallelism**: Split evaluation samples across GPUs (each GPU has full model copy)
+- **Tensor Parallelism**: Split model weights across GPUs (for large models)
+- **Pipeline Parallelism**: Split model layers across GPUs (for very large models)
+
+**When to use**:
+- Data Parallel: Model fits on single GPU, want faster evaluation
+- Tensor/Pipeline Parallel: Model too large for single GPU
+
+## HuggingFace Models (`hf`)
+
+### Data Parallelism (Recommended)
+
+Each GPU loads a full copy of the model and processes a subset of evaluation data.
+
+**Single Node (8 GPUs)**:
+```bash
+accelerate launch --multi_gpu --num_processes 8 \
+  -m lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf,dtype=bfloat16 \
+  --tasks mmlu,gsm8k,hellaswag \
+  --batch_size 16
+```
+
+**Speedup**: Near-linear (8 GPUs = ~8× faster)
+
+**Memory**: Each GPU needs full model (7B model ≈ 14GB × 8 = 112GB total)
+
+### Tensor Parallelism (Model Sharding)
+
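+Split model weights across GPUs for models too large for a single GPU. Note that `parallelize=True` places whole layers on different devices (naive pipeline-style sharding), so it saves memory but does not speed up computation the way true tensor parallelism does.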
+
+**Without accelerate launcher**:
+```bash
+lm_eval --model hf \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    parallelize=True,\
+    dtype=bfloat16 \
+  --tasks mmlu,gsm8k \
+  --batch_size 8
+```
+
+**With 8 GPUs**: 70B model (140GB) / 8 = 17.5GB per GPU ✅
+
+**Advanced sharding**:
+```bash
+lm_eval --model hf \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    parallelize=True,\
+    device_map_option=auto,\
+    max_memory_per_gpu=40GB,\
+    max_cpu_memory=100GB,\
+    dtype=bfloat16 \
+  --tasks mmlu
+```
+
+**Options**:
+- `device_map_option`: `"auto"` (default), `"balanced"`, `"balanced_low_0"`
+- `max_memory_per_gpu`: Max memory per GPU (e.g., `"40GB"`)
+- `max_cpu_memory`: Max CPU memory for offloading
+- `offload_folder`: Disk offloading directory
+
+### Combined Data + Tensor Parallelism
+
+Use both for very large models.
+
+**Example: 70B model on 16 GPUs (2 copies, 8 GPUs each)**:
+```bash
+accelerate launch --multi_gpu --num_processes 2 \
+  -m lm_eval --model hf \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    parallelize=True,\
+    dtype=bfloat16 \
+  --tasks mmlu \
+  --batch_size 8
+```
+
+**Result**: 2× speedup from data parallelism, 70B model fits via tensor parallelism
+
+### Configuration with `accelerate config`
+
+Create `~/.cache/huggingface/accelerate/default_config.yaml`:
+```yaml
+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+num_machines: 1
+num_processes: 8
+gpu_ids: all
+mixed_precision: bf16
+```
+
+**Then run**:
+```bash
+accelerate launch -m lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu
+```
+
+## vLLM Models (`vllm`)
+
+vLLM provides highly optimized distributed inference.
+
+### Tensor Parallelism
+
+**Single Node (4 GPUs)**:
+```bash
+lm_eval --model vllm \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    tensor_parallel_size=4,\
+    dtype=auto,\
+    gpu_memory_utilization=0.9 \
+  --tasks mmlu,gsm8k \
+  --batch_size auto
+```
+
+**Memory**: 70B model split across 4 GPUs = ~35GB per GPU
+
+### Data Parallelism
+
+**Multiple model replicas**:
+```bash
+lm_eval --model vllm \
+  --model_args \
+    pretrained=meta-llama/Llama-2-7b-hf,\
+    data_parallel_size=4,\
+    dtype=auto,\
+    gpu_memory_utilization=0.8 \
+  --tasks hellaswag,arc_challenge \
+  --batch_size auto
+```
+
+**Result**: 4 model replicas = 4× throughput
+
+### Combined Tensor + Data Parallelism
+
+**Example: 8 GPUs = 4 TP × 2 DP**:
+```bash
+lm_eval --model vllm \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    tensor_parallel_size=4,\
+    data_parallel_size=2,\
+    dtype=auto,\
+    gpu_memory_utilization=0.85 \
+  --tasks mmlu \
+  --batch_size auto
+```
+
+**Result**: 70B model fits (TP=4), 2× speedup (DP=2)
+
+### Multi-Node vLLM
+
+vLLM doesn't natively support multi-node. Use Ray:
+
+```bash
+# Start Ray cluster
+ray start --head --port=6379
+
+# Run evaluation
+lm_eval --model vllm \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    tensor_parallel_size=8,\
+    dtype=auto \
+  --tasks mmlu
+```
+
+## NVIDIA NeMo Models (`nemo_lm`)
+
+### Data Replication
+
+**8 replicas on 8 GPUs**:
+```bash
+torchrun --nproc-per-node=8 --no-python \
+  lm_eval --model nemo_lm \
+  --model_args \
+    path=/path/to/model.nemo,\
+    devices=8 \
+  --tasks hellaswag,arc_challenge \
+  --batch_size 32
+```
+
+**Speedup**: Near-linear (8× faster)
+
+### Tensor Parallelism
+
+**4-way tensor parallelism**:
+```bash
+torchrun --nproc-per-node=4 --no-python \
+  lm_eval --model nemo_lm \
+  --model_args \
+    path=/path/to/70b_model.nemo,\
+    devices=4,\
+    tensor_model_parallel_size=4 \
+  --tasks mmlu,gsm8k \
+  --batch_size 16
+```
+
+### Pipeline Parallelism
+
+**2 TP × 2 PP on 4 GPUs**:
+```bash
+torchrun --nproc-per-node=4 --no-python \
+  lm_eval --model nemo_lm \
+  --model_args \
+    path=/path/to/model.nemo,\
+    devices=4,\
+    tensor_model_parallel_size=2,\
+    pipeline_model_parallel_size=2 \
+  --tasks mmlu \
+  --batch_size 8
+```
+
+**Constraint**: `devices = TP × PP`
+
+### Multi-Node NeMo
+
+Currently not supported by lm-evaluation-harness.
+
+## SGLang Models (`sglang`)
+
+### Tensor Parallelism
+
+```bash
+lm_eval --model sglang \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    tp_size=4,\
+    dtype=auto \
+  --tasks gsm8k \
+  --batch_size auto
+```
+
+### Data Parallelism (Deprecated)
+
+**Note**: SGLang is deprecating data parallelism. Use tensor parallelism instead.
+
+```bash
+lm_eval --model sglang \
+  --model_args \
+    pretrained=meta-llama/Llama-2-7b-hf,\
+    dp_size=4,\
+    dtype=auto \
+  --tasks mmlu
+```
+
+## Performance Comparison
+
+### 70B Model Evaluation (MMLU, 5-shot)
+
+| Method | GPUs | Time | Memory/GPU | Notes |
+|--------|------|------|------------|-------|
+| HF (no parallel) | 1 | N/A | 140GB (OOM) | Won't fit |
+| HF (TP=8) | 8 | 2 hours | 17.5GB | Slower, fits |
+| HF (DP=8) | 8 | N/A | 140GB (OOM) | Won't fit (each GPU needs the full model) |
+| vLLM (TP=4) | 4 | 30 min | 35GB | Fast! |
+| vLLM (TP=4, DP=2) | 8 | 15 min | 35GB | Fastest |
+
+### 7B Model Evaluation (Multiple Tasks)
+
+| Method | GPUs | Time | Speedup |
+|--------|------|------|---------|
+| HF (single) | 1 | 4 hours | 1× |
+| HF (DP=4) | 4 | 1 hour | 4× |
+| HF (DP=8) | 8 | 30 min | 8× |
+| vLLM (DP=8) | 8 | 15 min | 16× |
+
+**Takeaway**: vLLM is significantly faster than HuggingFace for inference.
+
+## Choosing Parallelism Strategy
+
+### Decision Tree
+
+```
+Model fits on single GPU?
+├─ YES: Use data parallelism
+│   ├─ HF: accelerate launch --multi_gpu --num_processes N
+│   └─ vLLM: data_parallel_size=N (fastest)
+│
+└─ NO: Use tensor/pipeline parallelism
+    ├─ Model < 70B:
+    │   └─ vLLM: tensor_parallel_size=4
+    ├─ Model 70-175B:
+    │   ├─ vLLM: tensor_parallel_size=8
+    │   └─ Or HF: parallelize=True
+    └─ Model > 175B:
+        └─ Contact framework authors
+```
+
+### Memory Estimation
+
+**Rule of thumb**:
+```
+Memory (GB) = Parameters (B) × Precision (bytes) × 1.2 (overhead)
+```
+
+**Examples**:
+- 7B FP16: 7 × 2 × 1.2 = 16.8GB ✅ Fits A100 40GB
+- 13B FP16: 13 × 2 × 1.2 = 31.2GB ✅ Fits A100 40GB
+- 70B FP16: 70 × 2 × 1.2 = 168GB ❌ Need TP=4 or TP=8
+- 70B BF16: 70 × 2 × 1.2 = 168GB (same as FP16)
+
+**With tensor parallelism**:
+```
+Memory per GPU = Total Memory / TP
+```
+
+- 70B on 4 GPUs: 168GB / 4 = 42GB per GPU ✅ (needs 80GB GPUs; does not fit an A100 40GB)
+- 70B on 8 GPUs: 168GB / 8 = 21GB per GPU ✅
+
+## Multi-Node Evaluation
+
+### HuggingFace with SLURM
+
+**Submit job**:
+```bash
+#!/bin/bash
+#SBATCH --nodes=4
+#SBATCH --gpus-per-node=8
+#SBATCH --ntasks-per-node=1
+
+srun accelerate launch --multi_gpu \
+  --num_processes $((SLURM_NNODES * 8)) \
+  -m lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu,gsm8k,hellaswag \
+  --batch_size 16
+```
+
+**Submit**:
+```bash
+sbatch eval_job.sh
+```
+
+### Manual Multi-Node Setup
+
+**On each node, run**:
+```bash
+accelerate launch \
+  --multi_gpu \
+  --num_machines 4 \
+  --num_processes 32 \
+  --main_process_ip $MASTER_IP \
+  --main_process_port 29500 \
+  --machine_rank $NODE_RANK \
+  -m lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu
+```
+
+**Environment variables**:
+- `MASTER_IP`: IP of rank 0 node
+- `NODE_RANK`: 0, 1, 2, 3 for each node
+
+## Best Practices
+
+### 1. Start Small
+
+Test on small sample first:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-70b-hf,parallelize=True \
+  --tasks mmlu \
+  --limit 100  # Just 100 samples
+```
+
+### 2. Monitor GPU Usage
+
+```bash
+# Terminal 1: Run evaluation
+lm_eval --model hf ...
+
+# Terminal 2: Monitor
+watch -n 1 nvidia-smi
+```
+
+Look for:
+- GPU utilization > 90%
+- Memory usage stable
+- All GPUs active
+
+### 3. Optimize Batch Size
+
+```bash
+# Auto batch size (recommended)
+--batch_size auto
+
+# Or tune manually
+--batch_size 16  # Start here
+--batch_size 32  # Increase if memory allows
+```
+
+### 4. Use Mixed Precision
+
+```bash
+--model_args dtype=bfloat16  # Faster, less memory
+```
+
+### 5. Check Communication
+
+For multi-GPU runs, check how the GPUs and network adapters are interconnected:
+```bash
+# Prints the GPU/NIC topology matrix (e.g., NVLink vs. PCIe links)
+nvidia-smi topo -m
+```
+
+## Troubleshooting
+
+### "CUDA out of memory"
+
+**Solutions**:
+1. Increase tensor parallelism:
+   ```bash
+   --model_args tensor_parallel_size=8  # Was 4
+   ```
+
+2. Reduce batch size:
+   ```bash
+   --batch_size 4  # Was 16
+   ```
+
+3. Lower precision:
+   ```bash
+   --model_args load_in_8bit=True  # 8-bit quantization (requires bitsandbytes)
+   ```
+
+### "NCCL error" or Hanging
+
+**Check**:
+1. All GPUs visible: `nvidia-smi`
+2. NCCL installed: `python -c "import torch; print(torch.cuda.nccl.version())"`
+3. Network connectivity between nodes
+
+**Fix**:
+```bash
+export NCCL_DEBUG=INFO  # Enable debug logging
+export NCCL_IB_DISABLE=0  # Use InfiniBand if available
+```
+
+### Slow Evaluation
+
+**Possible causes**:
+1. **Data loading bottleneck**: Preprocess dataset
+2. **Low GPU utilization**: Increase batch size
+3. **Communication overhead**: Reduce parallelism degree
+
+**Profile**:
+```bash
+lm_eval --model hf \
+  --model_args pretrained=meta-llama/Llama-2-7b-hf \
+  --tasks mmlu \
+  --limit 100 \
+  --output_path results/ --log_samples  # Inspect per-sample outputs (requires --output_path)
+```
+
+### GPUs Imbalanced
+
+**Symptom**: GPU 0 at 100%, others at 50%
+
+**Solution**: Use `device_map_option=balanced`:
+```bash
+--model_args parallelize=True,device_map_option=balanced
+```
+
+## Example Configurations
+
+### Small Model (7B) - Fast Evaluation
+
+```bash
+# 8 A100s, data parallel
+accelerate launch --multi_gpu --num_processes 8 \
+  -m lm_eval --model hf \
+  --model_args \
+    pretrained=meta-llama/Llama-2-7b-hf,\
+    dtype=bfloat16 \
+  --tasks mmlu,gsm8k,hellaswag,arc_challenge \
+  --num_fewshot 5 \
+  --batch_size 32
+
+# Time: ~30 minutes
+```
+
+### Large Model (70B) - vLLM
+
+```bash
+# 8 H100s, tensor parallel
+lm_eval --model vllm \
+  --model_args \
+    pretrained=meta-llama/Llama-2-70b-hf,\
+    tensor_parallel_size=8,\
+    dtype=auto,\
+    gpu_memory_utilization=0.9 \
+  --tasks mmlu,gsm8k,humaneval \
+  --num_fewshot 5 \
+  --batch_size auto
+
+# Time: ~1 hour
+```
+
+### Very Large Model (175B+)
+
+**Requires specialized setup - contact framework maintainers**
+
+## References
+
+- HuggingFace Accelerate: https://huggingface.co/docs/accelerate/
+- vLLM docs: https://docs.vllm.ai/
+- NeMo docs: https://docs.nvidia.com/nemo-framework/
+- lm-eval distributed guide: `docs/model_guide.md`
diff --git a/agents/gerhard-hermes/skills/mlops/huggingface-hub/SKILL.md b/agents/gerhard-hermes/skills/mlops/huggingface-hub/SKILL.md
index 218a1ee..9177754 100644
--- a/agents/gerhard-hermes/skills/mlops/huggingface-hub/SKILL.md
+++ b/agents/gerhard-hermes/skills/mlops/huggingface-hub/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: huggingface-hub
-description: "HuggingFace hf CLI: search/download/upload models, datasets."
+description: Hugging Face Hub CLI (hf) — search, download, and upload models and datasets, manage repos, query datasets with SQL, deploy inference endpoints, manage Spaces and buckets.
 version: 1.0.0
 author: Hugging Face
 license: MIT
diff --git a/agents/gerhard-hermes/skills/mlops/inference/outlines/SKILL.md b/agents/gerhard-hermes/skills/mlops/inference/outlines/SKILL.md
index 8415a9a..d7a3324 100644
--- a/agents/gerhard-hermes/skills/mlops/inference/outlines/SKILL.md
+++ b/agents/gerhard-hermes/skills/mlops/inference/outlines/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: outlines
-description: "Outlines: structured JSON/regex/Pydantic LLM generation."
+description: Guarantee valid JSON/XML/code structure during generation, use Pydantic models for type-safe outputs, support local models (Transformers, vLLM), and maximize inference speed with Outlines - dottxt.ai's structured generation library
 version: 1.0.0
 author: Orchestra Research
 license: MIT
diff --git a/agents/gerhard-hermes/skills/mlops/models/segment-anything/SKILL.md b/agents/gerhard-hermes/skills/mlops/models/segment-anything/SKILL.md
index a21e05e..2fea761 100644
--- a/agents/gerhard-hermes/skills/mlops/models/segment-anything/SKILL.md
+++ b/agents/gerhard-hermes/skills/mlops/models/segment-anything/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: segment-anything-model
-description: "SAM: zero-shot image segmentation via points, boxes, masks."
+description: Foundation model for image segmentation with zero-shot transfer. Use when you need to segment any object in images using points, boxes, or masks as prompts, or automatically generate all object masks in an image.
 version: 1.0.0
 author: Orchestra Research
 license: MIT
diff --git a/agents/gerhard-hermes/skills/mlops/training/trl-fine-tuning/SKILL.md b/agents/gerhard-hermes/skills/mlops/training/trl-fine-tuning/SKILL.md
index c730759..70023fc 100644
--- a/agents/gerhard-hermes/skills/mlops/training/trl-fine-tuning/SKILL.md
+++ b/agents/gerhard-hermes/skills/mlops/training/trl-fine-tuning/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: fine-tuning-with-trl
-description: "TRL: SFT, DPO, PPO, GRPO, reward modeling for LLM RLHF."
+description: Fine-tune LLMs using reinforcement learning with TRL - SFT for instruction tuning, DPO for preference alignment, PPO/GRPO for reward optimization, and reward model training. Use when you need RLHF, want to align a model with preferences, or need to train from human feedback. Works with HuggingFace Transformers.
 version: 1.0.0
 author: Orchestra Research
 license: MIT
diff --git a/agents/gerhard-hermes/skills/mlops/training/unsloth/SKILL.md b/agents/gerhard-hermes/skills/mlops/training/unsloth/SKILL.md
index 9025474..a3ecd12 100644
--- a/agents/gerhard-hermes/skills/mlops/training/unsloth/SKILL.md
+++ b/agents/gerhard-hermes/skills/mlops/training/unsloth/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: unsloth
-description: "Unsloth: 2-5x faster LoRA/QLoRA fine-tuning, less VRAM."
+description: Expert guidance for fast fine-tuning with Unsloth - 2-5x faster training, 50-80% less memory, LoRA/QLoRA optimization
 version: 1.0.0
 author: Orchestra Research
 license: MIT
diff --git a/agents/gerhard-hermes/skills/productivity/linear/SKILL.md b/agents/gerhard-hermes/skills/productivity/linear/SKILL.md
index b7c23ca..6c2bf56 100644
--- a/agents/gerhard-hermes/skills/productivity/linear/SKILL.md
+++ b/agents/gerhard-hermes/skills/productivity/linear/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: linear
-description: "Linear: manage issues, projects, teams via GraphQL + curl."
+description: Manage Linear issues, projects, and teams via the GraphQL API. Create, update, search, and organize issues. Uses API key auth (no OAuth needed). All operations via curl — no dependencies.
 version: 1.0.0
 author: Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/productivity/maps/SKILL.md b/agents/gerhard-hermes/skills/productivity/maps/SKILL.md
index 73715a8..d93692a 100644
--- a/agents/gerhard-hermes/skills/productivity/maps/SKILL.md
+++ b/agents/gerhard-hermes/skills/productivity/maps/SKILL.md
@@ -1,6 +1,11 @@
 ---
 name: maps
-description: "Geocode, POIs, routes, timezones via OpenStreetMap/OSRM."
+description: >
+  Location intelligence — geocode a place, reverse-geocode coordinates,
+  find nearby places (46 POI categories), driving/walking/cycling
+  distance + time, turn-by-turn directions, timezone lookup, bounding
+  box + area for a named place, and POI search within a rectangle.
+  Uses OpenStreetMap + Overpass + OSRM. Free, no API key.
 version: 1.2.0
 author: Mibayy
 license: MIT
diff --git a/agents/gerhard-hermes/skills/productivity/maps/scripts/maps_client.py b/agents/gerhard-hermes/skills/productivity/maps/scripts/maps_client.py
index 279a41a..06d775e 100644
--- a/agents/gerhard-hermes/skills/productivity/maps/scripts/maps_client.py
+++ b/agents/gerhard-hermes/skills/productivity/maps/scripts/maps_client.py
@@ -926,18 +926,13 @@ def cmd_timezone(args):
                 os_ = offset_info.get("seconds", 0)
                 sign = "+" if oh >= 0 else "-"
                 utc_offset = f"{sign}{abs(oh):02d}:{om:02d}"
-                if os_:
-                    utc_offset = f"{utc_offset}:{os_:02d}"
             elif tz_data.get("standardUtcOffset"):
                 offset_info2 = tz_data["standardUtcOffset"]
                 if isinstance(offset_info2, dict):
                     oh = offset_info2.get("hours", 0)
                     om = abs(offset_info2.get("minutes", 0))
-                    os_ = offset_info2.get("seconds", 0)
                     sign = "+" if oh >= 0 else "-"
                     utc_offset = f"{sign}{abs(oh):02d}:{om:02d}"
-                    if os_:
-                        utc_offset = f"{utc_offset}:{os_:02d}"
             timezone_src = "timeapi.io"
     except (RuntimeError, KeyError, TypeError):
         pass  # API may be down; continue to fallback
diff --git a/agents/gerhard-hermes/skills/productivity/nano-pdf/SKILL.md b/agents/gerhard-hermes/skills/productivity/nano-pdf/SKILL.md
index ffb3f75..059cb59 100644
--- a/agents/gerhard-hermes/skills/productivity/nano-pdf/SKILL.md
+++ b/agents/gerhard-hermes/skills/productivity/nano-pdf/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: nano-pdf
-description: "Edit PDF text/typos/titles via nano-pdf CLI (NL prompts)."
+description: Edit PDFs with natural-language instructions using the nano-pdf CLI. Modify text, fix typos, update titles, and make content changes to specific pages without manual editing.
 version: 1.0.0
 author: community
 license: MIT
diff --git a/agents/gerhard-hermes/skills/productivity/notion/SKILL.md b/agents/gerhard-hermes/skills/productivity/notion/SKILL.md
index 0664bd8..c74d0df 100644
--- a/agents/gerhard-hermes/skills/productivity/notion/SKILL.md
+++ b/agents/gerhard-hermes/skills/productivity/notion/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: notion
-description: "Notion API via curl: pages, databases, blocks, search."
+description: Notion API for creating and managing pages, databases, and blocks via curl. Search, create, update, and query Notion workspaces directly from the terminal.
 version: 1.0.0
 author: community
 license: MIT
diff --git a/agents/gerhard-hermes/skills/productivity/powerpoint/SKILL.md b/agents/gerhard-hermes/skills/productivity/powerpoint/SKILL.md
index 13fa0df..2443209 100644
--- a/agents/gerhard-hermes/skills/productivity/powerpoint/SKILL.md
+++ b/agents/gerhard-hermes/skills/productivity/powerpoint/SKILL.md
@@ -1,15 +1,11 @@
 ---
 name: powerpoint
-description: "Create, read, edit .pptx decks, slides, notes, templates."
+description: "Use this skill any time a .pptx file is involved in any way — as input, output, or both. This includes: creating slide decks, pitch decks, or presentations; reading, parsing, or extracting text from any .pptx file (even if the extracted content will be used elsewhere, like in an email or summary); editing, modifying, or updating existing presentations; combining or splitting slide files; working with templates, layouts, speaker notes, or comments. Trigger whenever the user mentions \"deck,\" \"slides,\" \"presentation,\" or references a .pptx filename, regardless of what they plan to do with the content afterward. If a .pptx file needs to be opened, created, or touched, use this skill."
 license: Proprietary. LICENSE.txt has complete terms
 ---
 
 # Powerpoint Skill
 
-## When to use
-
-Use this skill any time a .pptx file is involved in any way — as input, output, or both. This includes: creating slide decks, pitch decks, or presentations; reading, parsing, or extracting text from any .pptx file (even if the extracted content will be used elsewhere, like in an email or summary); editing, modifying, or updating existing presentations; combining or splitting slide files; working with templates, layouts, speaker notes, or comments. Trigger whenever the user mentions "deck," "slides," "presentation," or references a .pptx filename, regardless of what they plan to do with the content afterward. If a .pptx file needs to be opened, created, or touched, use this skill.
-
 ## Quick Reference
 
 | Task | Guide |
diff --git a/agents/gerhard-hermes/skills/software-development/plan/SKILL.md b/agents/gerhard-hermes/skills/software-development/plan/SKILL.md
index 382dd2d..daf6bf7 100644
--- a/agents/gerhard-hermes/skills/software-development/plan/SKILL.md
+++ b/agents/gerhard-hermes/skills/software-development/plan/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: plan
-description: "Plan mode: write markdown plan to .hermes/plans/, no exec."
+description: Plan mode for Hermes — inspect context, write a markdown plan into the active workspace's `.hermes/plans/` directory, and do not execute the work.
 version: 1.0.0
 author: Hermes Agent
 license: MIT
diff --git a/agents/gerhard-hermes/skills/software-development/requesting-code-review/SKILL.md b/agents/gerhard-hermes/skills/software-development/requesting-code-review/SKILL.md
index cbeaa23..a5ae66e 100644
--- a/agents/gerhard-hermes/skills/software-development/requesting-code-review/SKILL.md
+++ b/agents/gerhard-hermes/skills/software-development/requesting-code-review/SKILL.md
@@ -1,6 +1,9 @@
 ---
 name: requesting-code-review
-description: "Pre-commit review: security scan, quality gates, auto-fix."
+description: >
+  Pre-commit verification pipeline — static security scan, baseline-aware
+  quality gates, independent reviewer subagent, and auto-fix loop. Use after
+  code changes and before committing, pushing, or opening a PR.
 version: 2.0.0
 author: Hermes Agent (adapted from obra/superpowers + MorAlekss)
 license: MIT
diff --git a/agents/gerhard-hermes/skills/software-development/subagent-driven-development/SKILL.md b/agents/gerhard-hermes/skills/software-development/subagent-driven-development/SKILL.md
index 23c5bf4..a47e441 100644
--- a/agents/gerhard-hermes/skills/software-development/subagent-driven-development/SKILL.md
+++ b/agents/gerhard-hermes/skills/software-development/subagent-driven-development/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: subagent-driven-development
-description: "Execute plans via delegate_task subagents (2-stage review)."
+description: Use when executing implementation plans with independent tasks. Dispatches fresh delegate_task per task with two-stage review (spec compliance then code quality).
 version: 1.1.0
 author: Hermes Agent (adapted from obra/superpowers)
 license: MIT
@@ -340,12 +340,3 @@ Catch issues early
 ```
 
 **Quality is not an accident. It's the result of systematic process.**
-
-## Further reading (load when relevant)
-
-When the orchestration involves significant context usage, long review loops, or complex validation checkpoints, load these references for the specific discipline:
-
-- **`references/context-budget-discipline.md`** — Four-tier context degradation model (PEAK / GOOD / DEGRADING / POOR), read-depth rules that scale with context window size, and early warning signs of silent degradation. Load when a run will clearly consume significant context (multi-phase plans, many subagents, large artifacts).
-- **`references/gates-taxonomy.md`** — The four canonical gate types (Pre-flight, Revision, Escalation, Abort) with behavior, recovery, and examples. Load when designing or reviewing any workflow that has validation checkpoints — use the vocabulary explicitly so each gate has defined entry, failure behavior, and resumption rules.
-
-Both references adapted from gsd-build/get-shit-done (MIT © 2025 Lex Christopherson).
diff --git a/agents/gerhard-hermes/skills/software-development/subagent-driven-development/references/context-budget-discipline.md b/agents/gerhard-hermes/skills/software-development/subagent-driven-development/references/context-budget-discipline.md
deleted file mode 100644
index 2728160..0000000
--- a/agents/gerhard-hermes/skills/software-development/subagent-driven-development/references/context-budget-discipline.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# Context Budget Discipline
-
-Practical rules for keeping orchestrator context lean when spawning subagents or reading large artifacts. Use these whenever you're running a multi-step agent loop that will consume significant context — plan execution, subagent orchestration, review pipelines, multi-file refactors.
-
-Adapted from the GSD (Get Shit Done) project's context-budget reference — MIT © 2025 Lex Christopherson ([gsd-build/get-shit-done](https://github.com/gsd-build/get-shit-done)).
-
-## Universal rules
-
-Every workflow that spawns agents or reads significant content must follow these:
-
-1. **Never read agent definition files.** `delegate_task` auto-loads them — you reading them too just doubles the cost.
-2. **Never inline large files into subagent prompts.** Tell the agent to read the file from disk with `read_file` instead. The subagent gets full content; your context stays lean.
-3. **Read depth scales with context window.** See the table below.
-4. **Delegate heavy work to subagents.** The orchestrator routes; it doesn't execute.
-5. **Proactively warn** the user when you've consumed significant context ("Context is getting heavy — consider checkpointing progress before we continue").
-
-## Read depth by context window
-
-Check the model's actual context window (not "it's Claude so 200K"). Some Sonnet deployments are 1M, some are 200K. If you don't know, assume the smaller one — err toward leanness.
-
-| Context window | Subagent output reading | Summary files | Verification files | Plans for other phases |
-|----------------|-------------------------|---------------|--------------------|-----------------------|
-| < 500k (e.g. 200k) | Frontmatter only | Frontmatter only | Frontmatter only | Current phase only |
-| >= 500k (1M models) | Full body permitted | Full body permitted | Full body permitted | Current phase only |
-
-"Frontmatter only" means: read enough to see the final status/verdict/conclusion. If the subagent wrote a 3000-line debug log, read the summary section it produced, not the log.
-
-## Four-tier degradation model
-
-Monitor your context usage and shift behavior as you climb the tiers. The point is to notice *before* you hit the wall, not when responses start truncating.
-
-| Tier | Usage | Behavior |
-|------|-------|----------|
-| **PEAK** | 0 – 30% | Full operations. Read bodies, spawn multiple agents in parallel, inline results freely. |
-| **GOOD** | 30 – 50% | Normal operations. Prefer frontmatter reads. Delegate aggressively. |
-| **DEGRADING** | 50 – 70% | Economize. Frontmatter-only reads, minimal inlining, **warn the user** about budget. |
-| **POOR** | 70%+ | Emergency mode. **Checkpoint progress immediately.** No new reads unless critical. Finish the current task and stop cleanly. |
-
-## Early warning signs (before panic thresholds fire)
-
-Quality degrades *gradually* before hard limits hit. Watch for these:
-
-- **Silent partial completion.** Subagent claims done but implementation is incomplete. Self-checks catch file existence, not semantic completeness. Always verify subagent output against the plan's must-haves, not just "did a file appear?"
-- **Increasing vagueness.** Agent starts using phrases like "appropriate handling" or "standard patterns" instead of specific code. This is context pressure showing up before budget warnings fire.
-- **Skipped protocol steps.** Agent omits steps it would normally follow. If success criteria has 8 items and the report covers 5, suspect context pressure, not "the agent decided 5 was enough."
-
-When these signs appear, checkpoint the work and either reset context or hand off to a fresh subagent.
-
-## Fundamental limitation
-
-When you orchestrate, you cannot verify semantic correctness of subagent output — only structural completeness ("did the file appear?", "does the test pass?"). Semantic verification requires either running the code yourself or delegating a review pass to another fresh subagent.
-
-**Mitigation:** in every task you delegate, include explicit "must-have" truths the subagent must confirm in its response (e.g., "confirm your test actually tests X, not just that X was imported"). The subagent re-asserting concrete facts is evidence; vague summaries are not.
diff --git a/agents/gerhard-hermes/skills/software-development/subagent-driven-development/references/gates-taxonomy.md b/agents/gerhard-hermes/skills/software-development/subagent-driven-development/references/gates-taxonomy.md
deleted file mode 100644
index 206f71e..0000000
--- a/agents/gerhard-hermes/skills/software-development/subagent-driven-development/references/gates-taxonomy.md
+++ /dev/null
@@ -1,93 +0,0 @@
-# Gates Taxonomy
-
-Canonical gate types for validation checkpoints across any workflow that spawns subagents, runs review loops, or has human-approval pauses. Every validation checkpoint maps to one of these four types — naming them explicitly makes the workflow legible and prevents "what happens when this check fails?" confusion.
-
-Adapted from the GSD (Get Shit Done) project's gates reference — MIT © 2025 Lex Christopherson ([gsd-build/get-shit-done](https://github.com/gsd-build/get-shit-done)).
-
-## The four gate types
-
-### 1. Pre-flight gate
-
-**Purpose:** Validates preconditions before starting an operation.
-
-**Behavior:** Blocks entry if conditions unmet. No partial work created — bail before anything changes.
-
-**Recovery:** Fix the missing precondition, then retry.
-
-**Examples:**
-- Implementation phase checks that the plan file exists before it starts writing code.
-- Delegated subagent checks that required env vars are set before making API calls.
-- Commit checks that tests passed before pushing.
-
-### 2. Revision gate
-
-**Purpose:** Evaluates output quality and routes to revision if insufficient.
-
-**Behavior:** Loops back to the producer with specific feedback. Bounded by an iteration cap (typically 3).
-
-**Recovery:** Producer addresses feedback; checker re-evaluates. The loop escalates early if issue count does not decrease between consecutive iterations (stall detection). After max iterations, escalates to the user unconditionally — never loop forever.
-
-**Examples:**
-- Plan reviewer reads a draft plan, returns specific issues, planner revises, reviewer re-reads (max 3 cycles).
-- Code reviewer checks subagent-produced code against must-haves; dispatches fixes back to the implementer if any must-have failed.
-- Test coverage checker validates new tests exercise the new paths; if not, sends back to author.
-
-### 3. Escalation gate
-
-**Purpose:** Surfaces unresolvable issues to the human for a decision.
-
-**Behavior:** Pauses workflow, presents options, waits for human input. Never guesses, never picks a default.
-
-**Recovery:** Human chooses action; workflow resumes on the selected path.
-
-**Examples:**
-- Revision loop exhausted after 3 iterations.
-- Merge conflict during automated worktree cleanup.
-- Ambiguous requirement — two reasonable interpretations and the choice changes the approach.
-- Subagent reports "the plan says X but the codebase actually does Y" — human decides which is right.
-
-### 4. Abort gate
-
-**Purpose:** Terminates the operation to prevent damage or waste.
-
-**Behavior:** Stops immediately, preserves state (checkpoint current progress), reports the specific reason.
-
-**Recovery:** Human investigates root cause, fixes, restarts from checkpoint.
-
-**Examples:**
-- Context window critically low during execution (POOR tier, >70%) — abort cleanly rather than produce truncated output.
-- Critical dependency unavailable mid-run (network down, API key revoked).
-- Unrecoverable filesystem state (disk full, permissions lost).
-- Safety invariant violated (agent attempted an irreversible destructive action outside approved scope).
-
-## How to use this in a skill
-
-When you write an orchestration skill that has validation checkpoints, **name each checkpoint by its gate type explicitly** and answer three questions:
-
-1. **What condition triggers this gate?** (e.g., "plan file missing", "issue count didn't decrease", "context >70%")
-2. **What happens when it fails?** (block / loop back / ask human / abort)
-3. **Who resumes, and from where?** (fix precondition + retry, revise + re-check, human decision, restart from checkpoint)
-
-Answering these three up front means your skill never hits "what do we do now?" at runtime.
-
-## Example — a review loop with all four gate types
-
-```
-[Pre-flight] plan.md exists and is non-empty?   → no: bail, ask user to write a plan first
-                ↓ yes
-[Execute]  subagent implements task
-                ↓
-[Revision] reviewer checks against must-haves  → fail: loop back to subagent (max 3)
-                ↓ pass
-[Pre-flight] tests pass?                       → no: bail, report failing tests
-                ↓ yes
-[Commit]
-                ↓
-(on revision loop exhaustion)
-[Escalation] "3 review cycles failed to converge on issue X — pick: force-merge, rewrite task, abandon"
-                ↓ user picks
-(on any tier-POOR context pressure during loop)
-[Abort] "context at 73%, checkpointing and stopping"
-```
-
-The vocabulary is small on purpose. Every gate in every workflow should fit one of these four. If you find yourself inventing a fifth, it's probably a revision gate with extra branching, or an escalation gate in disguise.
diff --git a/agents/gerhard-hermes/skills/software-development/systematic-debugging/SKILL.md b/agents/gerhard-hermes/skills/software-development/systematic-debugging/SKILL.md
index 3c37c16..70a68d5 100644
--- a/agents/gerhard-hermes/skills/software-development/systematic-debugging/SKILL.md
+++ b/agents/gerhard-hermes/skills/software-development/systematic-debugging/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: systematic-debugging
-description: "4-phase root cause debugging: understand bugs before fixing."
+description: Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first.
 version: 1.1.0
 author: Hermes Agent (adapted from obra/superpowers)
 license: MIT
diff --git a/agents/gerhard-hermes/skills/software-development/test-driven-development/SKILL.md b/agents/gerhard-hermes/skills/software-development/test-driven-development/SKILL.md
index 5cc6c32..4be2d53 100644
--- a/agents/gerhard-hermes/skills/software-development/test-driven-development/SKILL.md
+++ b/agents/gerhard-hermes/skills/software-development/test-driven-development/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: test-driven-development
-description: "TDD: enforce RED-GREEN-REFACTOR, tests before code."
+description: Use when implementing any feature or bugfix, before writing implementation code. Enforces RED-GREEN-REFACTOR cycle with test-first approach.
 version: 1.1.0
 author: Hermes Agent (adapted from obra/superpowers)
 license: MIT
diff --git a/agents/gerhard-hermes/skills/software-development/writing-plans/SKILL.md b/agents/gerhard-hermes/skills/software-development/writing-plans/SKILL.md
index 728714f..92a8d01 100644
--- a/agents/gerhard-hermes/skills/software-development/writing-plans/SKILL.md
+++ b/agents/gerhard-hermes/skills/software-development/writing-plans/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: writing-plans
-description: "Write implementation plans: bite-sized tasks, paths, code."
+description: Use when you have a spec or requirements for a multi-step task. Creates comprehensive implementation plans with bite-sized tasks, exact file paths, and complete code examples.
 version: 1.1.0
 author: Hermes Agent (adapted from obra/superpowers)
 license: MIT
diff --git a/agents/hermes/.gitkeep b/agents/hermes/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/agents/hermes/agents/agent0/prompts/agent.system.main.role.md b/agents/hermes/agents/agent0/prompts/agent.system.main.role.md
new file mode 100644
index 0000000..dbdf6c2
--- /dev/null
+++ b/agents/hermes/agents/agent0/prompts/agent.system.main.role.md
@@ -0,0 +1,3 @@
+# Hermes
+**Role:** Robot
+**Known as:** Machine Herald
diff --git a/agents/hermes/plugins/_office/stale-cleanup-v2.done b/agents/hermes/plugins/_office/stale-cleanup-v2.done
new file mode 100644
index 0000000..9766475
--- /dev/null
+++ b/agents/hermes/plugins/_office/stale-cleanup-v2.done
@@ -0,0 +1 @@
+ok
diff --git a/agents/hermes/scheduler/tasks.json b/agents/hermes/scheduler/tasks.json
new file mode 100644
index 0000000..b817cdd
--- /dev/null
+++ b/agents/hermes/scheduler/tasks.json
@@ -0,0 +1 @@
+{"tasks":[]}
\ No newline at end of file
diff --git a/scripts/pull-agent-identity.py b/scripts/pull-agent-identity.py
index e1c7a30..4ebee07 100644
--- a/scripts/pull-agent-identity.py
+++ b/scripts/pull-agent-identity.py
@@ -86,9 +86,21 @@ log(f'Received identity — name={agent.get("name", "?")!r}')
 # ── Build identity markdown ───────────────────────────────────────────────────
 
 def build_identity_markdown(a):
-    parts = [f'# {a["name"]}']
-    if a.get('role'):              parts.append(f'**Role:** {a["role"]}')
-    if a.get('from_name'):         parts.append(f'**Known as:** {a["from_name"]}')
+    name = a["name"]
+    known_as = a.get('from_name') or name
+    role = a.get('role', '')
+
+    # Lead with an unambiguous self-identity statement so the model does not
+    # default to "I am Agent Zero". The heading and opening sentence must be
+    # explicit — "# Gunnar" alone reads as a subject heading about the user.
+    parts = [
+        f'## Your identity',
+        f'Your name is {known_as}. You are not "Agent Zero" — that is the name of the '
+        f'framework you run on. Your name is {known_as}.',
+    ]
+    if role:
+        parts.append(f'Your role is: {role}.')
+
     if a.get('identity_document'): parts.append(f'\n## Background\n{a["identity_document"]}')
     if a.get('job_description'):   parts.append(f'\n## Job Description\n{a["job_description"]}')
     if a.get('guardrails'):        parts.append(f'\n## Guardrails\n{a["guardrails"]}')