Merge remote-tracking branch 'refs/remotes/origin/main'

This commit is contained in:
2026-05-09 20:09:41 +02:00
131 changed files with 2259 additions and 2134 deletions
+9 -4
View File
@@ -1,5 +1,13 @@
{
<<<<<<< HEAD
"updated_at": "2026-05-09T16:45:03.491152",
=======
<<<<<<< HEAD
"updated_at": "2026-05-09T17:16:32.631120",
=======
"updated_at": "2026-05-09T15:50:02.252303",
>>>>>>> 4179f93276e789a4ddacdd3480d14b69c5944497
>>>>>>> refs/remotes/origin/main
"platforms": {
"telegram": [],
"discord": [],
@@ -17,9 +25,6 @@
"wecom_callback": [],
"weixin": [],
"bluebubbles": [],
"qqbot": [],
"yuanbao": [],
"irc": [],
"teams": []
"qqbot": []
}
}
+5 -1
View File
@@ -1 +1,5 @@
{"pid":7,"kind":"hermes-gateway","argv":["/opt/hermes/.venv/bin/hermes","gateway","run"],"start_time":3028669,"gateway_state":"draining","exit_reason":null,"restart_requested":false,"active_agents":0,"platforms":{},"updated_at":"2026-05-09T18:04:27.274373+00:00"}
<<<<<<< HEAD
{"pid":7,"kind":"hermes-gateway","argv":["/opt/hermes/.venv/bin/hermes","gateway","run"],"start_time":3028669,"gateway_state":"draining","exit_reason":null,"restart_requested":false,"active_agents":0,"platforms":{},"updated_at":"2026-05-09T18:04:27.274373+00:00"}
=======
{"pid": 7, "kind": "hermes-gateway", "argv": ["/opt/hermes/.venv/bin/hermes", "gateway", "run"], "start_time": 97270363, "gateway_state": "running", "exit_reason": null, "restart_requested": false, "active_agents": 0, "platforms": {}, "updated_at": "2026-05-09T17:16:32.582158+00:00"}
>>>>>>> refs/remotes/origin/main
+25 -40
View File
@@ -1,89 +1,74 @@
airtable:dec8bcab05383e0ca8ae0e3c241d3a48
apple-notes:5e448abf984561fb33b197045ce41388
apple-reminders:cda2963c73800643faf4a34ef813879a
architecture-diagram:8ed67034726b0ac3639d9c009d166222
apple-notes:16ffca134c5590714781d8aeef51f8f3
apple-reminders:0273a9a17f6d07c55c84735c4366186b
architecture-diagram:999ab6d4445dbd407a82031857aa9791
arxiv:0ad5eb32727a1cb2bbff9e1e8e4dbff7
ascii-art:6eed9eb0c7cedf2bccd3cb7b7c91271c
ascii-video:ab08372213418d643c81445fe759c28e
ascii-video:93697173a0a33f7ecb7c4dc1c27f80e8
audiocraft-audio-generation:41d06b6ec94d1cdb3d864efe452780fd
axolotl:710b8e88805a85efc461dcd70c937cae
baoyu-comic:0be1250d5433538d71a4ab6d81b359dc
baoyu-infographic:567069c2548a69eafcbce09c028438dd
baoyu-infographic:d00f808010611c77d3fe00f58d2d7176
blogwatcher:d0b55ef6acff9ad26f1febace610ca3b
claude-code:88bbb9f0e26f8148141da379e4e837c5
claude-design:6607092a7d19705b9647067a09afd733
codebase-inspection:97bf36f290117abc11ffde72535713e2
codebase-inspection:5b1f99e926f347fe7c8c2658c9cc15b9
codex:79bb6b5d9b47453cd0d7ac25df5a3c97
comfyui:d6f42584ff328d6aa6a4b2e8e678c030
debugging-hermes-tui-commands:f992bee7976a1d0f59884fa57e58f314
design-md:a09844075e6e856a4a256dbc5f9e899a
dogfood:77ff237be7db22a4ef3850b411d915ed
design-md:267d0d8c363c9809744d1c62d561805e
dogfood:fc03244c3237e6b7325dc8aef387f2e3
dspy:5e0770e2563d11d9d4cc040681277c1c
evaluating-llms-harness:784cd66354b654dedf7541cd9b9e4c91
excalidraw:1679ad1d31a591fa3cb636d9150adcc7
findmy:1d7dd3ae39cf25357a374c6bfb956442
fine-tuning-with-trl:f73c765998375978e9fe529cafa6054a
findmy:bd50940d7b0104f6d6bf8981fc54b827
fine-tuning-with-trl:b2f0948b0f6e7202a452d9569bbd8f64
gif-search:dc9206e5c5c2d648774864df5222c95f
github-auth:6afa4cccb1eacad83dcdae2930b818a9
github-code-review:41071b74c0222d4e784de8f0927f757d
github-issues:3e4d98c7a6b1ebd0a55c752abb7a612b
github-pr-workflow:834e9cd72f18ea4598934d8d253b5858
github-repo-management:8479a9fb418f8dcfbbb191caaeccaa37
github-auth:909ef9bbff492b214a625179f704c09a
github-code-review:e56793f8efef112bbcdad96f69b45ddd
github-issues:ecb864a88aeea8f88f5b8742fec8806b
github-pr-workflow:cab1d57b84e253dddff37bd212f469ca
github-repo-management:7d7131b113d4dc2509a47501a6638e76
godmode:c592b460bf06e1f31b51bc6ac299e111
google-workspace:cf9028aff358f6c6b6ebc183672ad947
heartmula:ce53b2e6c9d68238cae5ae727738ecde
hermes-agent:286e1312a50b53f11b9714f506989e4f
hermes-agent-skill-authoring:d5b8b704b92d44ffa1e44f8b3d795037
hermes-agent:1c55510fc8a7a8c0fee3134866ca5dc2
himalaya:9da608734d1af8dab132406492bd5828
huggingface-hub:c02809f64f3a534ad1970e094474f04f
humanizer:0a006757e41d605ba0818ecca10288ed
huggingface-hub:14002a449cb5f9a5ff8bdc7f730bcb2f
ideation:0d1719daa364f2c5badd40c94620360f
imessage:f545da0f5cc64dd9ee1ffd2b7733a11b
jupyter-live-kernel:54612d9f0ff1b5eb6564f2dfeb5102b7
kanban-orchestrator:1636b60c79180ee89108727bff9383c7
kanban-worker:bc9124639762b2a5c20cd85580ae92e4
jupyter-live-kernel:6bda9690d8c71095ac738bd9825e32f2
linear:ab7a5dbd4001e31e2bd888d86ab699f8
llama-cpp:fcfa4c23d52ac84abccf0b38e9844e07
llm-wiki:9cb710c49d1af6fdba54d06a835a5498
manim-video:86ba8c24fdd57771d68bea812d3b2466
maps:5c8bb0a45921760a9c8f598ebfe7631e
maps:285f3436aafadf452fac8c0bb5715e40
minecraft-modpack-server:3cc682f8aef5f86d3580601ba28f9ba3
nano-pdf:dd55aca10b8e2844a0cda3c68c757e83
native-mcp:5564a9d31ce4165b532c575a315ddca4
node-inspect-debugger:e8f38e8586a090b880edcdbcba67ec76
notion:e24ae292897a6ca7837867864bc82c3c
native-mcp:a8644a4f45c8403c1ad3342230b5c154
notion:ac54a68c490d4cf1604bc24160083d43
obliteratus:98dfcbfcad4416d27d5dcbd0a491d772
obsidian:1dde562f384c6dc5eaec0b7c214caab4
ocr-and-documents:0fe461668b245d894370b8b40c3eb012
opencode:e3583bfa72da47385f6466eaf226faef
openhue:0487b4695a071cc62da64c79935bc8d1
outlines:ac034ba450bf3d0d05eb736dddcd117f
p5js:5879c824a5487d6553d9380e37aa9c5e
outlines:8efbd31f1252f6c8fb340db4d9dcce2f
p5js:80de285f6ef54c19c22e4eafd1877fe4
pixel-art:f94fe511926a222052ec8d2dc892b112
plan:6a014103919a9b11d60e2d6267055871
pokemon-player:2a30ed51c1179b22967fb4a33e6e57e4
polymarket:b4a7d758f2fb29efb290dce1094cc625
popular-web-designs:a77ef442dcf747d8d534f5acb6b6f0cf
powerpoint:6ae6326c8fc5ff5a67b8e5283437ec30
pretext:1a72b0c0b65188ce43917cac6d5b8973
python-debugpy:d40cd39a90885e2c5ac7be13bbf5e832
requesting-code-review:f76de34aee69387c297cf982c85fd6fe
requesting-code-review:f9cc90df11a9ce1cc23595c574eacd75
research-paper-writing:e1fa7bb71e73fbc74ea017720f971e9a
segment-anything-model:a2403c1bf179c28cbac2ba7d56357b69
serving-llms-vllm:a8b5453a5316da8df055a0f23c3cbd25
sketch:56b3e77b9ff82d38fe1c7b8c6067de5d
songsee:7738e32bff3ca9ec32b37b32e0a8c9ca
songwriting-and-ai-music:65b4a6757901021ca16d9c8ecab62f7c
spike:a1034fab3d8669745ee75474dd9c3a6b
spotify:af733b32166f235fe3e0026e213ff2d4
subagent-driven-development:3d4c3f5060b7e1577fc3306b9ca36ffd
systematic-debugging:a02cf3ccd7b79909137ac1af46d01ed6
test-driven-development:32bc0784dc0720a9e536ba1ce559fedf
touchdesigner-mcp:3a428984eb83905c5ae89d0abf0ef866
unsloth:6482bcde01d0a9aeaddc247932c3c69c
webhook-subscriptions:edce3200566edfa7259718b51b8f52f3
weights-and-biases:91fd048a0b693f6d74a4639ea08bbd1d
writing-plans:c91061baf59682c9b10a317b5ff25617
writing-plans:5b72a4318524fd7ffb37fd43e51e3954
xurl:97a1749bd7274b93c631d71d2cf92e52
youtube-content:c448e213097433492d51a063d34eb9ae
yuanbao:69fa2e9e8b534a633443d47262e86855
@@ -1,6 +1,6 @@
---
name: apple-notes
description: "Manage Apple Notes via memo CLI: create, search, edit."
description: Manage Apple Notes via the memo CLI on macOS (create, view, search, edit).
version: 1.0.0
author: Hermes Agent
license: MIT
@@ -1,6 +1,6 @@
---
name: apple-reminders
description: "Apple Reminders via remindctl: add, list, complete."
description: Manage Apple Reminders via remindctl CLI (list, add, complete, delete).
version: 1.0.0
author: Hermes Agent
license: MIT
@@ -1,6 +1,6 @@
---
name: findmy
description: "Track Apple devices/AirTags via FindMy.app on macOS."
description: Track Apple devices and AirTags via FindMy.app on macOS using AppleScript and screen capture.
version: 1.0.0
author: Hermes Agent
license: MIT
@@ -1,6 +1,6 @@
---
name: claude-code
description: "Delegate coding to Claude Code CLI (features, PRs)."
description: Delegate coding tasks to Claude Code (Anthropic's CLI agent). Use for building features, refactoring, PR reviews, and iterative coding. Requires the claude CLI installed.
version: 2.2.0
author: Hermes Agent + Teknium
license: MIT
@@ -1,6 +1,6 @@
---
name: hermes-agent
description: "Configure, extend, or contribute to Hermes Agent."
description: Complete guide to using and extending Hermes Agent — CLI usage, setup, configuration, spawning additional agents, gateway platforms, skills, voice, tools, profiles, and a concise contributor reference. Load this skill when helping users configure Hermes, troubleshoot issues, spawn agent instances, or make code contributions.
version: 2.0.0
author: Hermes Agent + Teknium
license: MIT
@@ -115,7 +115,7 @@ hermes tools disable NAME Disable a toolset
hermes skills list List installed skills
hermes skills search QUERY Search the skills hub
hermes skills install ID Install a skill (ID can be a hub identifier OR a direct https://…/SKILL.md URL; pass --name to override when frontmatter has no name)
hermes skills install ID Install a skill
hermes skills inspect ID Preview without installing
hermes skills config Enable/disable skills per platform
hermes skills check Check for updates
@@ -281,6 +281,7 @@ Type these during an interactive chat session.
### Utility
```
/branch (/fork) Branch the current session
/btw Ephemeral side question (doesn't interrupt main task)
/fast Toggle priority/fast processing
/browser Open CDP browser connection
/history Show conversation history (CLI)
@@ -402,63 +403,6 @@ Tool changes take effect on `/reset` (new session). They do NOT apply mid-conver
---
## Security & Privacy Toggles
Common "why is Hermes doing X to my output / tool calls / commands?" toggles — and the exact commands to change them. Most of these need a fresh session (`/reset` in chat, or start a new `hermes` invocation) because they're read once at startup.
### Secret redaction in tool output
Secret redaction is **off by default** — tool output (terminal stdout, `read_file`, web content, subagent summaries, etc.) passes through unmodified. If the user wants Hermes to auto-mask strings that look like API keys, tokens, and secrets before they enter the conversation context and logs:
```bash
hermes config set security.redact_secrets true # enable globally
```
**Restart required.** `security.redact_secrets` is snapshotted at import time — toggling it mid-session (e.g. via `export HERMES_REDACT_SECRETS=true` from a tool call) will NOT take effect for the running process. Tell the user to run `hermes config set security.redact_secrets true` in a terminal, then start a new session. This is deliberate — it prevents an LLM from flipping the toggle on itself mid-task.
Disable again with:
```bash
hermes config set security.redact_secrets false
```
### PII redaction in gateway messages
Separate from secret redaction. When enabled, the gateway hashes user IDs and strips phone numbers from the session context before it reaches the model:
```bash
hermes config set privacy.redact_pii true # enable
hermes config set privacy.redact_pii false # disable (default)
```
### Command approval prompts
By default (`approvals.mode: manual`), Hermes prompts the user before running shell commands flagged as destructive (`rm -rf`, `git reset --hard`, etc.). The modes are:
- `manual` — always prompt (default)
- `smart` — use an auxiliary LLM to auto-approve low-risk commands, prompt on high-risk
- `off` — skip all approval prompts (equivalent to `--yolo`)
```bash
hermes config set approvals.mode smart # recommended middle ground
hermes config set approvals.mode off # bypass everything (not recommended)
```
Per-invocation bypass without changing config:
- `hermes --yolo …`
- `export HERMES_YOLO_MODE=1`
Note: YOLO / `approvals.mode: off` does NOT turn off secret redaction. They are independent.
### Shell hooks allowlist
Some shell-hook integrations require explicit allowlisting before they fire. Managed via `~/.hermes/shell-hooks-allowlist.json` — prompted interactively the first time a hook wants to run.
### Disabling the web/browser/image-gen tools
To keep the model away from network or media tools entirely, open `hermes tools` and toggle per-platform. Takes effect on next session (`/reset`). See the Tools & Skills section above.
---
## Voice & Transcription
### STT (Voice → Text)
@@ -1,6 +1,6 @@
---
name: architecture-diagram
description: "Dark-themed SVG architecture/cloud/infra diagrams as HTML."
description: Generate dark-themed SVG diagrams of software systems and cloud infrastructure as standalone HTML files with inline SVG graphics. Semantic component colors (cyan=frontend, emerald=backend, violet=database, amber=cloud/AWS, rose=security, orange=message bus), JetBrains Mono font, grid background. Best suited for software architecture, cloud/VPC topology, microservice maps, service-mesh diagrams, database + API layer diagrams, security groups, message buses — anything that fits a tech-infra deck with a dark aesthetic. If a more specialized diagramming skill exists for the subject (scientific, educational, hand-drawn, animated, etc.), prefer that — otherwise this skill can also serve as a general-purpose SVG diagram fallback. Based on Cocoon AI's architecture-diagram-generator (MIT).
version: 1.0.0
author: Cocoon AI (hello@cocoon-ai.com), ported by Hermes Agent
license: MIT
@@ -1,6 +1,6 @@
---
name: ascii-art
description: "ASCII art: pyfiglet, cowsay, boxes, image-to-ascii."
description: Generate ASCII art using pyfiglet (571 fonts), cowsay, boxes, toilet, image-to-ascii, remote APIs (asciified, ascii.co.uk), and LLM fallback. No API keys required.
version: 4.0.0
author: 0xbyt4, Hermes Agent
license: MIT
@@ -1,18 +1,10 @@
---
name: ascii-video
description: "ASCII video: convert video/audio to colored ASCII MP4/GIF."
description: "Production pipeline for ASCII art video — any format. Converts video/audio/images/generative input into colored ASCII character video output (MP4, GIF, image sequence). Covers: video-to-ASCII conversion, audio-reactive music visualizers, generative ASCII art animations, hybrid video+audio reactive, text/lyrics overlays, real-time terminal rendering. Use when users request: ASCII video, text art video, terminal-style video, character art animation, retro text visualization, audio visualizer in ASCII, converting video to ASCII art, matrix-style effects, or any animated ASCII output."
---
# ASCII Video Production Pipeline
## When to use
Use when users request: ASCII video, text art video, terminal-style video, character art animation, retro text visualization, audio visualizer in ASCII, converting video to ASCII art, matrix-style effects, or any animated ASCII output.
## What's inside
Production pipeline for ASCII art video — any format. Converts video/audio/images/generative input into colored ASCII character video output (MP4, GIF, image sequence). Covers: video-to-ASCII conversion, audio-reactive music visualizers, generative ASCII art animations, hybrid video+audio reactive, text/lyrics overlays, real-time terminal rendering.
## Creative Standard
This is visual art. ASCII characters are the medium; cinema is the standard.
@@ -1,6 +1,6 @@
---
name: baoyu-comic
description: "Knowledge comics (知识漫画): educational, biography, tutorial."
description: Knowledge comic creator supporting multiple art styles and tones. Creates original educational comics with detailed panel layouts and sequential image generation. Use when user asks to create "知识漫画", "教育漫画", "biography comic", "tutorial comic", or "Logicomix-style comic".
version: 1.56.1
author: 宝玉 (JimLiu)
license: MIT
@@ -1,6 +1,6 @@
---
name: baoyu-infographic
description: "Infographics: 21 layouts x 21 styles (信息图, 可视化)."
description: Generate professional infographics with 21 layout types and 21 visual styles. Analyzes content, recommends layout×style combinations, and generates publication-ready infographics. Use when user asks to create "infographic", "visual summary", "信息图", "可视化", or "高密度信息大图".
version: 1.56.1
author: 宝玉 (JimLiu)
license: MIT
@@ -1,7 +1,7 @@
---
name: ideation
title: Creative Ideation — Constraint-Driven Project Generation
description: "Generate project ideas via creative constraints."
description: "Generate project ideas through creative constraints. Use when the user says 'I want to build something', 'give me a project idea', 'I'm bored', 'what should I make', 'inspire me', or any variant of 'I have tools but no direction'. Works for code, art, hardware, writing, tools, and anything that can be made."
version: 1.0.0
author: SHL0MS
license: MIT
@@ -14,10 +14,6 @@ metadata:
# Creative Ideation
## When to use
Use when the user says 'I want to build something', 'give me a project idea', 'I'm bored', 'what should I make', 'inspire me', or any variant of 'I have tools but no direction'. Works for code, art, hardware, writing, tools, and anything that can be made.
Generate project ideas through creative constraints. Constraint + direction = creativity.
## How It Works
@@ -1,13 +1,13 @@
---
name: design-md
description: Author/validate/export Google's DESIGN.md token spec files.
description: Author, validate, diff, and export DESIGN.md files — Google's open-source format spec that gives coding agents a persistent, structured understanding of a design system (tokens + rationale in one file). Use when building a design system, porting style rules between projects, generating UI with consistent brand, or auditing accessibility/contrast.
version: 1.0.0
author: Hermes Agent
license: MIT
metadata:
hermes:
tags: [design, design-system, tokens, ui, accessibility, wcag, tailwind, dtcg, google]
related_skills: [popular-web-designs, claude-design, excalidraw, architecture-diagram]
related_skills: [popular-web-designs, excalidraw, architecture-diagram]
---
# DESIGN.md Skill
@@ -31,9 +31,7 @@ diffs versions for regressions, and exports to Tailwind or W3C DTCG JSON.
- User wants contrast / WCAG accessibility validation on their color palette
For purely visual inspiration or layout examples, use `popular-web-designs`
instead. For *process and taste* when designing a one-off HTML artifact
from scratch (prototype, deck, landing page, component lab), use
`claude-design`. This skill is for the *formal spec file* itself.
instead. This skill is for the *formal spec file* itself.
## File anatomy
@@ -1,6 +1,6 @@
---
name: p5js
description: "p5.js sketches: gen art, shaders, interactive, 3D."
description: "Production pipeline for interactive and generative visual art using p5.js. Creates browser-based sketches, generative art, data visualizations, interactive experiences, 3D scenes, audio-reactive visuals, and motion graphics — exported as HTML, PNG, GIF, MP4, or SVG. Covers: 2D/3D rendering, noise and particle systems, flow fields, shaders (GLSL), pixel manipulation, kinetic typography, WebGL scenes, audio analysis, mouse/keyboard interaction, and headless high-res export. Use when users request: p5.js sketches, creative coding, generative art, interactive visualizations, canvas animations, browser-based visual art, data viz, shader effects, or any p5.js project."
version: 1.0.0
metadata:
hermes:
@@ -10,14 +10,6 @@ metadata:
# p5.js Production Pipeline
## When to use
Use when users request: p5.js sketches, creative coding, generative art, interactive visualizations, canvas animations, browser-based visual art, data viz, shader effects, or any p5.js project.
## What's inside
Production pipeline for interactive and generative visual art using p5.js. Creates browser-based sketches, generative art, data visualizations, interactive experiences, 3D scenes, audio-reactive visuals, and motion graphics — exported as HTML, PNG, GIF, MP4, or SVG. Covers: 2D/3D rendering, noise and particle systems, flow fields, shaders (GLSL), pixel manipulation, kinetic typography, WebGL scenes, audio analysis, mouse/keyboard interaction, and headless high-res export.
## Creative Standard
This is visual art rendered in the browser. The canvas is the medium; the algorithm is the brush.
@@ -1,6 +1,6 @@
---
name: pixel-art
description: "Pixel art w/ era palettes (NES, Game Boy, PICO-8)."
description: Convert images into retro pixel art with hardware-accurate palettes (NES, Game Boy, PICO-8, C64, etc.), and animate them into short videos. Presets cover arcade, SNES, and 10+ era-correct looks. Use `clarify` to let the user pick a style before generating.
version: 2.0.0
author: dodo-reach
license: MIT
@@ -1,6 +1,9 @@
---
name: songwriting-and-ai-music
description: "Songwriting craft and Suno AI music prompts."
description: >
Songwriting craft, AI music generation prompts (Suno focus), parody/adaptation
techniques, phonetic tricks, and lessons learned. These are tools and ideas,
not rules. Break any of them when the art calls for it.
tags: [songwriting, music, suno, parody, lyrics, creative]
triggers:
- writing a song
@@ -1,6 +1,11 @@
---
name: jupyter-live-kernel
description: "Iterative Python via live Jupyter kernel (hamelnb)."
description: >
Use a live Jupyter kernel for stateful, iterative Python execution via hamelnb.
Load this skill when the task involves exploration, iteration, or inspecting
intermediate results — data science, ML experimentation, API exploration, or
building up complex code step-by-step. Uses terminal to run CLI commands against
a live Jupyter kernel. No new tools required.
version: 1.0.0
author: Hermes Agent
license: MIT
@@ -1,6 +1,6 @@
---
name: webhook-subscriptions
description: "Webhook subscriptions: event-driven agent runs."
description: Create and manage webhook subscriptions for event-driven agent activation, or for direct push notifications (zero LLM cost). Use when the user wants external services to trigger agent runs OR push notifications to chats.
version: 1.1.0
metadata:
hermes:
@@ -1,6 +1,6 @@
---
name: dogfood
description: "Exploratory QA of web apps: find bugs, evidence, reports."
description: Systematic exploratory QA testing of web applications — find bugs, capture evidence, and generate structured reports
version: 1.0.0
metadata:
hermes:
@@ -1,6 +1,6 @@
---
name: himalaya
description: "Himalaya CLI: IMAP/SMTP email from terminal."
description: CLI to manage emails via IMAP/SMTP. Use himalaya to list, read, write, reply, forward, search, and organize emails from the terminal. Supports multiple accounts and message composition with MML (MIME Meta Language).
version: 1.0.0
author: community
license: MIT
@@ -1,6 +1,6 @@
---
name: codebase-inspection
description: "Inspect codebases w/ pygount: LOC, languages, ratios."
description: Inspect and analyze codebases using pygount for LOC counting, language breakdown, and code-vs-comment ratios. Use when asked to check lines of code, repo size, language composition, or codebase stats.
version: 1.0.0
author: Hermes Agent
license: MIT
@@ -1,6 +1,6 @@
---
name: github-auth
description: "GitHub auth setup: HTTPS tokens, SSH keys, gh CLI login."
description: Set up GitHub authentication for the agent using git (universally available) or the gh CLI. Covers HTTPS tokens, SSH keys, credential helpers, and gh auth — with a detection flow to pick the right method automatically.
version: 1.1.0
author: Hermes Agent
license: MIT
@@ -1,6 +1,6 @@
---
name: github-code-review
description: "Review PRs: diffs, inline comments via gh or REST."
description: Review code changes by analyzing git diffs, leaving inline comments on PRs, and performing thorough pre-push review. Works with gh CLI or falls back to git + GitHub REST API via curl.
version: 1.1.0
author: Hermes Agent
license: MIT
@@ -1,6 +1,6 @@
---
name: github-issues
description: "Create, triage, label, assign GitHub issues via gh or REST."
description: Create, manage, triage, and close GitHub issues. Search existing issues, add labels, assign people, and link to PRs. Works with gh CLI or falls back to git + GitHub REST API via curl.
version: 1.1.0
author: Hermes Agent
license: MIT
@@ -1,6 +1,6 @@
---
name: github-pr-workflow
description: "GitHub PR lifecycle: branch, commit, open, CI, merge."
description: Full pull request lifecycle — create branches, commit changes, open PRs, monitor CI status, auto-fix failures, and merge. Works with gh CLI or falls back to git + GitHub REST API via curl.
version: 1.1.0
author: Hermes Agent
license: MIT
@@ -1,6 +1,6 @@
---
name: github-repo-management
description: "Clone/create/fork repos; manage remotes, releases."
description: Clone, create, fork, configure, and manage GitHub repositories. Manage remotes, secrets, releases, and workflows. Works with gh CLI or falls back to git + GitHub REST API via curl.
version: 1.1.0
author: Hermes Agent
license: MIT
@@ -1,6 +1,6 @@
---
name: native-mcp
description: "MCP client: connect servers, register tools (stdio/HTTP)."
description: Built-in MCP (Model Context Protocol) client that connects to external MCP servers, discovers their tools, and registers them as native Hermes Agent tools. Supports stdio and HTTP transports with automatic reconnection, security filtering, and zero-config tool injection.
version: 1.0.0
author: Hermes Agent
license: MIT
@@ -1,6 +1,6 @@
---
name: gif-search
description: "Search/download GIFs from Tenor via curl + jq."
description: Search and download GIFs from Tenor using curl. No dependencies beyond curl and jq. Useful for finding reaction GIFs, creating visual content, and sending GIFs in chat.
version: 1.1.0
author: Hermes Agent
license: MIT
@@ -16,10 +16,6 @@ metadata:
Search and download GIFs directly via the Tenor API using curl. No extra tools needed.
## When to use
Useful for finding reaction GIFs, creating visual content, and sending GIFs in chat.
## Setup
Set your Tenor API key in your environment (add to `~/.hermes/.env`):
@@ -1,6 +1,6 @@
---
name: songsee
description: "Audio spectrograms/features (mel, chroma, MFCC) via CLI."
description: Generate spectrograms and audio feature visualizations (mel, chroma, MFCC, tempogram, etc.) from audio files via CLI. Useful for audio analysis, music production debugging, and visual documentation.
version: 1.0.0
author: community
license: MIT
@@ -0,0 +1,490 @@
# API Evaluation
Guide to evaluating OpenAI, Anthropic, and other API-based language models.
## Overview
The lm-evaluation-harness supports evaluating API-based models through a unified `TemplateAPI` interface. This allows benchmarking of:
- OpenAI models (GPT-4, GPT-3.5, etc.)
- Anthropic models (Claude 3, Claude 2, etc.)
- Local OpenAI-compatible APIs
- Custom API endpoints
**Why evaluate API models**:
- Benchmark closed-source models
- Compare API models to open models
- Validate API performance
- Track model updates over time
## Supported API Models
| Provider | Model Type | Request Types | Logprobs |
|----------|------------|---------------|----------|
| OpenAI (completions) | `openai-completions` | All | ✅ Yes |
| OpenAI (chat) | `openai-chat-completions` | `generate_until` only | ❌ No |
| Anthropic (completions) | `anthropic-completions` | All | ❌ No |
| Anthropic (chat) | `anthropic-chat` | `generate_until` only | ❌ No |
| Local (OpenAI-compatible) | `local-completions` | Depends on server | Varies |
**Note**: Models without logprobs can only be evaluated on generation tasks, not perplexity or loglikelihood tasks.
## OpenAI Models
### Setup
```bash
export OPENAI_API_KEY=sk-...
```
### Completion Models (Legacy)
**Available models**: `davinci-002`, `babbage-002`
```bash
lm_eval --model openai-completions \
--model_args model=davinci-002 \
--tasks lambada_openai,hellaswag \
--batch_size auto
```
**Supports**:
- `generate_until`: ✅
- `loglikelihood`: ✅
- `loglikelihood_rolling`: ✅
### Chat Models
**Available models**: `gpt-4`, `gpt-4-turbo`, `gpt-3.5-turbo`
```bash
lm_eval --model openai-chat-completions \
--model_args model=gpt-4-turbo \
--tasks mmlu,gsm8k,humaneval \
--num_fewshot 5 \
--batch_size auto
```
**Supports**:
- `generate_until`: ✅
- `loglikelihood`: ❌ (no logprobs)
- `loglikelihood_rolling`: ❌
**Important**: Chat models don't provide logprobs, so they can only be used with generation tasks (MMLU, GSM8K, HumanEval), not perplexity tasks.
### Configuration Options
```bash
lm_eval --model openai-chat-completions \
--model_args \
model=gpt-4-turbo,\
base_url=https://api.openai.com/v1,\
num_concurrent=5,\
max_retries=3,\
timeout=60,\
batch_size=auto
```
**Parameters**:
- `model`: Model identifier (required)
- `base_url`: API endpoint (default: OpenAI)
- `num_concurrent`: Concurrent requests (default: 5)
- `max_retries`: Retry failed requests (default: 3)
- `timeout`: Request timeout in seconds (default: 60)
- `tokenizer`: Tokenizer to use (default: matches model)
- `tokenizer_backend`: `"tiktoken"` or `"huggingface"`
### Cost Management
OpenAI charges per token. Estimate costs before running:
```python
# Rough estimate
num_samples = 1000
avg_tokens_per_sample = 500 # input + output
cost_per_1k_tokens = 0.01 # GPT-3.5 Turbo
total_cost = (num_samples * avg_tokens_per_sample / 1000) * cost_per_1k_tokens
print(f"Estimated cost: ${total_cost:.2f}")
```
**Cost-saving tips**:
- Use `--limit N` for testing
- Start with `gpt-3.5-turbo` before `gpt-4`
- Set `max_gen_toks` to minimum needed
- Use `num_fewshot=0` for zero-shot when possible
## Anthropic Models
### Setup
```bash
export ANTHROPIC_API_KEY=sk-ant-...
```
### Completion Models (Legacy)
```bash
lm_eval --model anthropic-completions \
--model_args model=claude-2.1 \
--tasks lambada_openai,hellaswag \
--batch_size auto
```
### Chat Models (Recommended)
**Available models**: `claude-3-5-sonnet-20241022`, `claude-3-opus-20240229`, `claude-3-sonnet-20240229`, `claude-3-haiku-20240307`
```bash
lm_eval --model anthropic-chat \
--model_args model=claude-3-5-sonnet-20241022 \
--tasks mmlu,gsm8k,humaneval \
--num_fewshot 5 \
--batch_size auto
```
**Aliases**: `anthropic-chat-completions` (same as `anthropic-chat`)
### Configuration Options
```bash
lm_eval --model anthropic-chat \
--model_args \
model=claude-3-5-sonnet-20241022,\
base_url=https://api.anthropic.com,\
num_concurrent=5,\
max_retries=3,\
timeout=60
```
### Cost Management
Anthropic pricing (as of 2024):
- Claude 3.5 Sonnet: $3.00 / 1M input, $15.00 / 1M output
- Claude 3 Opus: $15.00 / 1M input, $75.00 / 1M output
- Claude 3 Haiku: $0.25 / 1M input, $1.25 / 1M output
**Budget-friendly strategy**:
```bash
# Test on small sample first
lm_eval --model anthropic-chat \
--model_args model=claude-3-haiku-20240307 \
--tasks mmlu \
--limit 100
# Then run full eval on best model
lm_eval --model anthropic-chat \
--model_args model=claude-3-5-sonnet-20241022 \
--tasks mmlu \
--num_fewshot 5
```
## Local OpenAI-Compatible APIs
Many local inference servers expose OpenAI-compatible APIs (vLLM, Text Generation Inference, llama.cpp, Ollama).
### vLLM Local Server
**Start server**:
```bash
vllm serve meta-llama/Llama-2-7b-hf \
--host 0.0.0.0 \
--port 8000
```
**Evaluate**:
```bash
lm_eval --model local-completions \
--model_args \
model=meta-llama/Llama-2-7b-hf,\
base_url=http://localhost:8000/v1,\
num_concurrent=1 \
--tasks mmlu,gsm8k \
--batch_size auto
```
### Text Generation Inference (TGI)
**Start server**:
```bash
docker run --gpus all --shm-size 1g -p 8080:80 \
ghcr.io/huggingface/text-generation-inference:latest \
--model-id meta-llama/Llama-2-7b-hf
```
**Evaluate**:
```bash
lm_eval --model local-completions \
--model_args \
model=meta-llama/Llama-2-7b-hf,\
base_url=http://localhost:8080/v1 \
--tasks hellaswag,arc_challenge
```
### Ollama
**Start server**:
```bash
ollama serve
ollama pull llama2:7b
```
**Evaluate**:
```bash
lm_eval --model local-completions \
--model_args \
model=llama2:7b,\
base_url=http://localhost:11434/v1 \
--tasks mmlu
```
### llama.cpp Server
**Start server**:
```bash
./server -m models/llama-2-7b.gguf --host 0.0.0.0 --port 8080
```
**Evaluate**:
```bash
lm_eval --model local-completions \
--model_args \
model=llama2,\
base_url=http://localhost:8080/v1 \
--tasks gsm8k
```
## Custom API Implementation
For custom API endpoints, subclass `TemplateAPI`:
### Create `my_api.py`
```python
from lm_eval.models.api_models import TemplateAPI
import requests
class MyCustomAPI(TemplateAPI):
"""Custom API model."""
def __init__(self, base_url, api_key, **kwargs):
super().__init__(base_url=base_url, **kwargs)
self.api_key = api_key
def _create_payload(self, messages, gen_kwargs):
"""Create API request payload."""
return {
"messages": messages,
"api_key": self.api_key,
**gen_kwargs
}
def parse_generations(self, response):
"""Parse generation response."""
return response.json()["choices"][0]["text"]
def parse_logprobs(self, response):
"""Parse logprobs (if available)."""
# Return None if API doesn't provide logprobs
logprobs = response.json().get("logprobs")
if logprobs:
return logprobs["token_logprobs"]
return None
```
### Register and Use
```python
from lm_eval import evaluator
from my_api import MyCustomAPI
model = MyCustomAPI(
base_url="https://api.example.com/v1",
api_key="your-key"
)
results = evaluator.simple_evaluate(
model=model,
tasks=["mmlu", "gsm8k"],
num_fewshot=5,
batch_size="auto"
)
```
## Comparing API and Open Models
### Side-by-Side Evaluation
```bash
# Evaluate OpenAI GPT-4
lm_eval --model openai-chat-completions \
--model_args model=gpt-4-turbo \
--tasks mmlu,gsm8k,hellaswag \
--num_fewshot 5 \
--output_path results/gpt4.json
# Evaluate open Llama 2 70B
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-70b-hf,dtype=bfloat16 \
--tasks mmlu,gsm8k,hellaswag \
--num_fewshot 5 \
--output_path results/llama2-70b.json
# Compare results
python scripts/compare_results.py \
results/gpt4.json \
results/llama2-70b.json
```
### Typical Comparisons
| Model | MMLU | GSM8K | HumanEval | Cost |
|-------|------|-------|-----------|------|
| GPT-4 Turbo | 86.4% | 92.0% | 67.0% | $$$$ |
| Claude 3 Opus | 86.8% | 95.0% | 84.9% | $$$$ |
| GPT-3.5 Turbo | 70.0% | 57.1% | 48.1% | $$ |
| Llama 2 70B | 68.9% | 56.8% | 29.9% | Free (self-host) |
| Mixtral 8x7B | 70.6% | 58.4% | 40.2% | Free (self-host) |
## Best Practices
### Rate Limiting
Respect API rate limits:
```bash
lm_eval --model openai-chat-completions \
--model_args \
model=gpt-4-turbo,\
num_concurrent=3,\ # Lower concurrency
timeout=120 \ # Longer timeout
--tasks mmlu
```
### Reproducibility
Set temperature to 0 for deterministic results:
```bash
lm_eval --model openai-chat-completions \
--model_args model=gpt-4-turbo \
--tasks mmlu \
--gen_kwargs temperature=0.0
```
Or use `seed` for sampling:
```bash
lm_eval --model anthropic-chat \
--model_args model=claude-3-5-sonnet-20241022 \
--tasks gsm8k \
--gen_kwargs temperature=0.7,seed=42
```
### Caching
API models automatically cache responses to avoid redundant calls:
```bash
# First run: makes API calls
lm_eval --model openai-chat-completions \
--model_args model=gpt-4-turbo \
--tasks mmlu \
--limit 100
# Second run: uses cache (instant, free)
lm_eval --model openai-chat-completions \
--model_args model=gpt-4-turbo \
--tasks mmlu \
--limit 100
```
Cache location: `~/.cache/lm_eval/`
### Error Handling
APIs can fail. Use retries:
```bash
lm_eval --model openai-chat-completions \
--model_args \
model=gpt-4-turbo,\
max_retries=5,\
timeout=120 \
--tasks mmlu
```
## Troubleshooting
### "Authentication failed"
Check API key:
```bash
echo $OPENAI_API_KEY # Should print sk-...
echo $ANTHROPIC_API_KEY # Should print sk-ant-...
```
### "Rate limit exceeded"
Reduce concurrency:
```bash
--model_args num_concurrent=1
```
Or add delays between requests.
### "Timeout error"
Increase timeout:
```bash
--model_args timeout=180
```
### "Model not found"
For local APIs, verify server is running:
```bash
curl http://localhost:8000/v1/models
```
### Cost Runaway
Use `--limit` for testing:
```bash
lm_eval --model openai-chat-completions \
--model_args model=gpt-4-turbo \
--tasks mmlu \
--limit 50 # Only 50 samples
```
## Advanced Features
### Custom Headers
```bash
lm_eval --model local-completions \
--model_args \
base_url=http://api.example.com/v1,\
header="Authorization: Bearer token,X-Custom: value"
```
### Disable SSL Verification (Development Only)
```bash
lm_eval --model local-completions \
--model_args \
base_url=https://localhost:8000/v1,\
verify_certificate=false
```
### Custom Tokenizer
```bash
lm_eval --model openai-chat-completions \
--model_args \
model=gpt-4-turbo,\
tokenizer=gpt2,\
tokenizer_backend=huggingface
```
## References
- OpenAI API: https://platform.openai.com/docs/api-reference
- Anthropic API: https://docs.anthropic.com/claude/reference
- TemplateAPI: `lm_eval/models/api_models.py`
- OpenAI models: `lm_eval/models/openai_completions.py`
- Anthropic models: `lm_eval/models/anthropic_llms.py`
@@ -0,0 +1,488 @@
# Benchmark Guide
Complete guide to all 60+ evaluation tasks in lm-evaluation-harness, what they measure, and how to interpret results.
## Overview
The lm-evaluation-harness includes 60+ benchmarks spanning:
- Language understanding (MMLU, GLUE)
- Mathematical reasoning (GSM8K, MATH)
- Code generation (HumanEval, MBPP)
- Instruction following (IFEval, AlpacaEval)
- Long-context understanding (LongBench)
- Multilingual capabilities (AfroBench, NorEval)
- Reasoning (BBH, ARC)
- Truthfulness (TruthfulQA)
**List all tasks**:
```bash
lm_eval --tasks list
```
## Major Benchmarks
### MMLU (Massive Multitask Language Understanding)
**What it measures**: Broad knowledge across 57 subjects (STEM, humanities, social sciences, law).
**Task variants**:
- `mmlu`: Original 57-subject benchmark
- `mmlu_pro`: More challenging version with reasoning-focused questions
- `mmlu_prox`: Multilingual extension
**Format**: Multiple choice (4 options)
**Example**:
```
Question: What is the capital of France?
A. Berlin
B. Paris
C. London
D. Madrid
Answer: B
```
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks mmlu \
--num_fewshot 5
```
**Interpretation**:
- Random: 25% (chance)
- GPT-3 (175B): 43.9%
- GPT-4: 86.4%
- Human expert: ~90%
**Good for**: Assessing general knowledge and domain expertise.
### GSM8K (Grade School Math 8K)
**What it measures**: Mathematical reasoning on grade-school level word problems.
**Task variants**:
- `gsm8k`: Base task
- `gsm8k_cot`: With chain-of-thought prompting
- `gsm_plus`: Adversarial variant with perturbations
**Format**: Free-form generation, extract numerical answer
**Example**:
```
Question: A baker made 200 cookies. He sold 3/5 of them in the morning and 1/4 of the remaining in the afternoon. How many cookies does he have left?
Answer: 60
```
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks gsm8k \
--num_fewshot 5
```
**Interpretation**:
- Random: ~0%
- GPT-3 (175B): 17.0%
- GPT-4: 92.0%
- Llama 2 70B: 56.8%
**Good for**: Testing multi-step reasoning and arithmetic.
### HumanEval
**What it measures**: Python code generation from docstrings (functional correctness).
**Task variants**:
- `humaneval`: Standard benchmark
- `humaneval_instruct`: For instruction-tuned models
**Format**: Code generation, execution-based evaluation
**Example**:
```python
def has_close_elements(numbers: List[float], threshold: float) -> bool:
""" Check if in given list of numbers, are any two numbers closer to each other than
given threshold.
>>> has_close_elements([1.0, 2.0, 3.0], 0.5)
False
>>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
True
"""
```
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=codellama/CodeLlama-7b-hf \
--tasks humaneval \
--batch_size 1
```
**Interpretation**:
- Random: 0%
- GPT-3 (175B): 0%
- Codex: 28.8%
- GPT-4: 67.0%
- Code Llama 34B: 53.7%
**Good for**: Evaluating code generation capabilities.
### BBH (BIG-Bench Hard)
**What it measures**: 23 challenging reasoning tasks where models previously failed to beat humans.
**Categories**:
- Logical reasoning
- Math word problems
- Social understanding
- Algorithmic reasoning
**Format**: Multiple choice and free-form
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks bbh \
--num_fewshot 3
```
**Interpretation**:
- Random: ~25%
- GPT-3 (175B): 33.9%
- PaLM 540B: 58.3%
- GPT-4: 86.7%
**Good for**: Testing advanced reasoning capabilities.
### IFEval (Instruction-Following Evaluation)
**What it measures**: Ability to follow specific, verifiable instructions.
**Instruction types**:
- Format constraints (e.g., "answer in 3 sentences")
- Length constraints (e.g., "use at least 100 words")
- Content constraints (e.g., "include the word 'banana'")
- Structural constraints (e.g., "use bullet points")
**Format**: Free-form generation with rule-based verification
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-chat-hf \
--tasks ifeval \
--batch_size auto
```
**Interpretation**:
- Measures: Instruction adherence (not quality)
- GPT-4: 86% instruction following
- Claude 2: 84%
**Good for**: Evaluating chat/instruct models.
### GLUE (General Language Understanding Evaluation)
**What it measures**: Natural language understanding across 9 tasks.
**Tasks**:
- `cola`: Grammatical acceptability
- `sst2`: Sentiment analysis
- `mrpc`: Paraphrase detection
- `qqp`: Question pairs
- `stsb`: Semantic similarity
- `mnli`: Natural language inference
- `qnli`: Question answering NLI
- `rte`: Recognizing textual entailment
- `wnli`: Winograd schemas
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=bert-base-uncased \
--tasks glue \
--num_fewshot 0
```
**Interpretation**:
- BERT Base: 78.3 (GLUE score)
- RoBERTa Large: 88.5
- Human baseline: 87.1
**Good for**: Encoder-only models, fine-tuning baselines.
### LongBench
**What it measures**: Long-context understanding (4K-32K tokens).
**21 tasks covering**:
- Single-document QA
- Multi-document QA
- Summarization
- Few-shot learning
- Code completion
- Synthetic tasks
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks longbench \
--batch_size 1
```
**Interpretation**:
- Tests context utilization
- Many models struggle beyond 4K tokens
- GPT-4 Turbo: 54.3%
**Good for**: Evaluating long-context models.
## Additional Benchmarks
### TruthfulQA
**What it measures**: Model's propensity to be truthful vs. generate plausible-sounding falsehoods.
**Format**: Multiple choice with 4-5 options
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks truthfulqa_mc2 \
--batch_size auto
```
**Interpretation**:
- Larger models often score worse (more convincing lies)
- GPT-3: 58.8%
- GPT-4: 59.0%
- Human: ~94%
### ARC (AI2 Reasoning Challenge)
**What it measures**: Grade-school science questions.
**Variants**:
- `arc_easy`: Easier questions
- `arc_challenge`: Harder questions requiring reasoning
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks arc_challenge \
--num_fewshot 25
```
**Interpretation**:
- ARC-Easy: Most models >80%
- ARC-Challenge random: 25%
- GPT-4: 96.3%
### HellaSwag
**What it measures**: Commonsense reasoning about everyday situations.
**Format**: Choose most plausible continuation
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks hellaswag \
--num_fewshot 10
```
**Interpretation**:
- Random: 25%
- GPT-3: 78.9%
- Llama 2 70B: 85.3%
### WinoGrande
**What it measures**: Commonsense reasoning via pronoun resolution.
**Example**:
```
The trophy doesn't fit in the brown suitcase because _ is too large.
A. the trophy
B. the suitcase
```
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks winogrande \
--num_fewshot 5
```
### PIQA
**What it measures**: Physical commonsense reasoning.
**Example**: "To clean a keyboard, use compressed air or..."
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks piqa
```
## Multilingual Benchmarks
### AfroBench
**What it measures**: Performance across 64 African languages.
**15 tasks**: NLU, text generation, knowledge, QA, math reasoning
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks afrobench
```
### NorEval
**What it measures**: Norwegian language understanding (9 task categories).
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=NbAiLab/nb-gpt-j-6B \
--tasks noreval
```
## Domain-Specific Benchmarks
### MATH
**What it measures**: High-school competition math problems.
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks math \
--num_fewshot 4
```
**Interpretation**:
- Very challenging
- GPT-4: 42.5%
- Minerva 540B: 33.6%
### MBPP (Mostly Basic Python Problems)
**What it measures**: Python programming from natural language descriptions.
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=codellama/CodeLlama-7b-hf \
--tasks mbpp \
--batch_size 1
```
### DROP
**What it measures**: Reading comprehension requiring discrete reasoning.
**Command**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks drop
```
## Benchmark Selection Guide
### For General Purpose Models
Run this suite:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks mmlu,gsm8k,hellaswag,arc_challenge,truthfulqa_mc2 \
--num_fewshot 5
```
### For Code Models
```bash
lm_eval --model hf \
--model_args pretrained=codellama/CodeLlama-7b-hf \
--tasks humaneval,mbpp \
--batch_size 1
```
### For Chat/Instruct Models
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-chat-hf \
--tasks ifeval,mmlu,gsm8k_cot \
--batch_size auto
```
### For Long Context Models
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-3.1-8B \
--tasks longbench \
--batch_size 1
```
## Interpreting Results
### Understanding Metrics
**Accuracy**: Percentage of correct answers (most common)
**Exact Match (EM)**: Requires exact string match (strict)
**F1 Score**: Balances precision and recall
**BLEU/ROUGE**: Text generation similarity
**Pass@k**: Percentage passing when generating k samples
### Typical Score Ranges
| Model Size | MMLU | GSM8K | HumanEval | HellaSwag |
|------------|------|-------|-----------|-----------|
| 7B | 40-50% | 10-20% | 5-15% | 70-80% |
| 13B | 45-55% | 20-35% | 15-25% | 75-82% |
| 70B | 60-70% | 50-65% | 35-50% | 82-87% |
| GPT-4 | 86% | 92% | 67% | 95% |
### Red Flags
- **All tasks at random chance**: Model not trained properly
- **Exact 0% on generation tasks**: Likely format/parsing issue
- **Huge variance across runs**: Check seed/sampling settings
- **Better than GPT-4 on everything**: Likely contamination
## Best Practices
1. **Always report few-shot setting**: 0-shot, 5-shot, etc.
2. **Run multiple seeds**: Report mean ± std
3. **Check for data contamination**: Search training data for benchmark examples
4. **Compare to published baselines**: Validate your setup
5. **Report all hyperparameters**: Model, batch size, max tokens, temperature
## References
- Task list: `lm_eval --tasks list`
- Task README: `lm_eval/tasks/README.md`
- Papers: See individual benchmark papers
@@ -0,0 +1,602 @@
# Custom Tasks
Complete guide to creating domain-specific evaluation tasks in lm-evaluation-harness.
## Overview
Custom tasks allow you to evaluate models on your own datasets and metrics. Tasks are defined using YAML configuration files with optional Python utilities for complex logic.
**Why create custom tasks**:
- Evaluate on proprietary/domain-specific data
- Test specific capabilities not covered by existing benchmarks
- Create evaluation pipelines for internal models
- Reproduce research experiments
## Quick Start
### Minimal Custom Task
Create `my_tasks/simple_qa.yaml`:
```yaml
task: simple_qa
dataset_path: data/simple_qa.jsonl
output_type: generate_until
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{answer}}"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
```
**Run it**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks simple_qa \
--include_path my_tasks/
```
## Task Configuration Reference
### Essential Fields
```yaml
# Task identification
task: my_custom_task # Unique task name (required)
task_alias: "My Task" # Display name
tag: # Tags for grouping
- custom
- domain_specific
# Dataset configuration
dataset_path: data/my_data.jsonl # HuggingFace dataset or local path
dataset_name: default # Subset name (if applicable)
training_split: train
validation_split: validation
test_split: test
# Evaluation configuration
output_type: generate_until # or loglikelihood, multiple_choice
num_fewshot: 5 # Number of few-shot examples
batch_size: auto # Batch size
# Prompt templates (Jinja2)
doc_to_text: "Question: {{question}}"
doc_to_target: "{{answer}}"
# Metrics
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
# Metadata
metadata:
version: 1.0
```
### Output Types
**`generate_until`**: Free-form generation
```yaml
output_type: generate_until
generation_kwargs:
max_gen_toks: 256
until:
- "\n"
- "."
temperature: 0.0
```
**`loglikelihood`**: Compute log probability of targets
```yaml
output_type: loglikelihood
# Used for perplexity, classification
```
**`multiple_choice`**: Choose from options
```yaml
output_type: multiple_choice
doc_to_choice: "{{choices}}" # List of choices
```
## Data Formats
### Local JSONL File
`data/my_data.jsonl`:
```json
{"question": "What is 2+2?", "answer": "4"}
{"question": "Capital of France?", "answer": "Paris"}
```
**Task config**:
```yaml
dataset_path: data/my_data.jsonl
dataset_kwargs:
data_files:
test: data/my_data.jsonl
```
### HuggingFace Dataset
```yaml
dataset_path: squad
dataset_name: plain_text
test_split: validation
```
### CSV File
`data/my_data.csv`:
```csv
question,answer,category
What is 2+2?,4,math
Capital of France?,Paris,geography
```
**Task config**:
```yaml
dataset_path: data/my_data.csv
dataset_kwargs:
data_files:
test: data/my_data.csv
```
## Prompt Engineering
### Simple Template
```yaml
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{answer}}"
```
### Conditional Logic
```yaml
doc_to_text: |
{% if context %}
Context: {{context}}
{% endif %}
Question: {{question}}
Answer:
```
### Multiple Choice
```yaml
doc_to_text: |
Question: {{question}}
A. {{choices[0]}}
B. {{choices[1]}}
C. {{choices[2]}}
D. {{choices[3]}}
Answer:
doc_to_target: "{{ 'ABCD'[answer_idx] }}"
doc_to_choice: ["A", "B", "C", "D"]
```
### Few-Shot Formatting
```yaml
fewshot_delimiter: "\n\n" # Between examples
target_delimiter: " " # Between question and answer
doc_to_text: "Q: {{question}}"
doc_to_target: "A: {{answer}}"
```
## Custom Python Functions
For complex logic, use Python functions in `utils.py`.
### Create `my_tasks/utils.py`
```python
def process_docs(dataset):
"""Preprocess documents."""
def _process(doc):
# Custom preprocessing
doc["question"] = doc["question"].strip().lower()
return doc
return dataset.map(_process)
def doc_to_text(doc):
"""Custom prompt formatting."""
context = doc.get("context", "")
question = doc["question"]
if context:
return f"Context: {context}\nQuestion: {question}\nAnswer:"
return f"Question: {question}\nAnswer:"
def doc_to_target(doc):
"""Custom target extraction."""
return doc["answer"].strip().lower()
def aggregate_scores(items):
"""Custom metric aggregation."""
correct = sum(1 for item in items if item == 1.0)
total = len(items)
return correct / total if total > 0 else 0.0
```
### Use in Task Config
```yaml
task: my_custom_task
dataset_path: data/my_data.jsonl
# Use Python functions
process_docs: !function utils.process_docs
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
metric_list:
- metric: exact_match
aggregation: !function utils.aggregate_scores
higher_is_better: true
```
## Real-World Examples
### Example 1: Domain QA Task
**Goal**: Evaluate medical question answering.
`medical_qa/medical_qa.yaml`:
```yaml
task: medical_qa
dataset_path: data/medical_qa.jsonl
output_type: generate_until
num_fewshot: 3
doc_to_text: |
Medical Question: {{question}}
Context: {{context}}
Answer (be concise):
doc_to_target: "{{answer}}"
generation_kwargs:
max_gen_toks: 100
until:
- "\n\n"
temperature: 0.0
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
- metric: !function utils.medical_f1
aggregation: mean
higher_is_better: true
filter_list:
- name: lowercase
filter:
- function: lowercase
- function: remove_whitespace
metadata:
version: 1.0
domain: medical
```
`medical_qa/utils.py`:
```python
from sklearn.metrics import f1_score
import re
def medical_f1(predictions, references):
"""Custom F1 for medical terms."""
pred_terms = set(extract_medical_terms(predictions[0]))
ref_terms = set(extract_medical_terms(references[0]))
if not pred_terms and not ref_terms:
return 1.0
if not pred_terms or not ref_terms:
return 0.0
tp = len(pred_terms & ref_terms)
fp = len(pred_terms - ref_terms)
fn = len(ref_terms - pred_terms)
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
def extract_medical_terms(text):
"""Extract medical terminology."""
# Custom logic
return re.findall(r'\b[A-Z][a-z]+(?:[A-Z][a-z]+)*\b', text)
```
### Example 2: Code Evaluation
`code_eval/python_challenges.yaml`:
```yaml
task: python_challenges
dataset_path: data/python_problems.jsonl
output_type: generate_until
num_fewshot: 0
doc_to_text: |
Write a Python function to solve:
{{problem_statement}}
Function signature:
{{function_signature}}
doc_to_target: "{{canonical_solution}}"
generation_kwargs:
max_gen_toks: 512
until:
- "\n\nclass"
- "\n\ndef"
temperature: 0.2
metric_list:
- metric: !function utils.execute_code
aggregation: mean
higher_is_better: true
process_results: !function utils.process_code_results
metadata:
version: 1.0
```
`code_eval/utils.py`:
```python
import subprocess
import json
def execute_code(predictions, references):
"""Execute generated code against test cases."""
generated_code = predictions[0]
test_cases = json.loads(references[0])
try:
# Execute code with test cases
for test_input, expected_output in test_cases:
result = execute_with_timeout(generated_code, test_input, timeout=5)
if result != expected_output:
return 0.0
return 1.0
except Exception:
return 0.0
def execute_with_timeout(code, input_data, timeout=5):
"""Safely execute code with timeout."""
# Implementation with subprocess and timeout
pass
def process_code_results(doc, results):
"""Process code execution results."""
return {
"passed": results[0] == 1.0,
"generated_code": results[1]
}
```
### Example 3: Instruction Following
`instruction_eval/instruction_eval.yaml`:
```yaml
task: instruction_following
dataset_path: data/instructions.jsonl
output_type: generate_until
num_fewshot: 0
doc_to_text: |
Instruction: {{instruction}}
{% if constraints %}
Constraints: {{constraints}}
{% endif %}
Response:
doc_to_target: "{{expected_response}}"
generation_kwargs:
max_gen_toks: 256
temperature: 0.7
metric_list:
- metric: !function utils.check_constraints
aggregation: mean
higher_is_better: true
- metric: !function utils.semantic_similarity
aggregation: mean
higher_is_better: true
process_docs: !function utils.add_constraint_checkers
```
`instruction_eval/utils.py`:
```python
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')
def check_constraints(predictions, references):
"""Check if response satisfies constraints."""
response = predictions[0]
constraints = json.loads(references[0])
satisfied = 0
total = len(constraints)
for constraint in constraints:
if verify_constraint(response, constraint):
satisfied += 1
return satisfied / total if total > 0 else 1.0
def verify_constraint(response, constraint):
"""Verify single constraint."""
if constraint["type"] == "length":
return len(response.split()) >= constraint["min_words"]
elif constraint["type"] == "contains":
return constraint["keyword"] in response.lower()
# Add more constraint types
return True
def semantic_similarity(predictions, references):
"""Compute semantic similarity."""
pred_embedding = model.encode(predictions[0])
ref_embedding = model.encode(references[0])
return float(util.cos_sim(pred_embedding, ref_embedding))
def add_constraint_checkers(dataset):
"""Parse constraints into verifiable format."""
def _parse(doc):
# Parse constraint string into structured format
doc["parsed_constraints"] = parse_constraints(doc.get("constraints", ""))
return doc
return dataset.map(_parse)
```
## Advanced Features
### Output Filtering
```yaml
filter_list:
- name: extract_answer
filter:
- function: regex
regex_pattern: "Answer: (.*)"
group: 1
- function: lowercase
- function: strip_whitespace
```
### Multiple Metrics
```yaml
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
- metric: f1
aggregation: mean
higher_is_better: true
- metric: bleu
aggregation: mean
higher_is_better: true
```
### Task Groups
Create `my_tasks/_default.yaml`:
```yaml
group: my_eval_suite
task:
- simple_qa
- medical_qa
- python_challenges
```
**Run entire suite**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks my_eval_suite \
--include_path my_tasks/
```
## Testing Your Task
### Validate Configuration
```bash
# Test task loading
lm_eval --tasks my_custom_task --include_path my_tasks/ --limit 0
# Run on 5 samples
lm_eval --model hf \
--model_args pretrained=gpt2 \
--tasks my_custom_task \
--include_path my_tasks/ \
--limit 5
```
### Debug Mode
```bash
lm_eval --model hf \
--model_args pretrained=gpt2 \
--tasks my_custom_task \
--include_path my_tasks/ \
--limit 1 \
--log_samples # Save input/output samples
```
## Best Practices
1. **Start simple**: Test with minimal config first
2. **Version your tasks**: Use `metadata.version`
3. **Document your metrics**: Explain custom metrics in comments
4. **Test with multiple models**: Ensure robustness
5. **Validate on known examples**: Include sanity checks
6. **Use filters carefully**: Can hide errors
7. **Handle edge cases**: Empty strings, missing fields
## Common Patterns
### Classification Task
```yaml
output_type: loglikelihood
doc_to_text: "Text: {{text}}\nLabel:"
doc_to_target: " {{label}}" # Space prefix important!
metric_list:
- metric: acc
aggregation: mean
```
### Perplexity Evaluation
```yaml
output_type: loglikelihood_rolling
doc_to_text: "{{text}}"
metric_list:
- metric: perplexity
aggregation: perplexity
```
### Ranking Task
```yaml
output_type: loglikelihood
doc_to_text: "Query: {{query}}\nPassage: {{passage}}\nRelevant:"
doc_to_target: [" Yes", " No"]
metric_list:
- metric: acc
aggregation: mean
```
## Troubleshooting
**"Task not found"**: Check `--include_path` and task name
**Empty results**: Verify `doc_to_text` and `doc_to_target` templates
**Metric errors**: Ensure metric names are correct (exact_match, not exact-match)
**Filter issues**: Test filters with `--log_samples`
**Python function not found**: Check `!function module.function_name` syntax
## References
- Task system: EleutherAI/lm-evaluation-harness docs
- Example tasks: `lm_eval/tasks/` directory
- TaskConfig: `lm_eval/api/task.py`
@@ -0,0 +1,519 @@
# Distributed Evaluation
Guide to running evaluation across multiple GPUs using data parallelism and tensor/pipeline parallelism.
## Overview
Distributed evaluation speeds up benchmarking by:
- **Data Parallelism**: Split evaluation samples across GPUs (each GPU has full model copy)
- **Tensor Parallelism**: Split model weights across GPUs (for large models)
- **Pipeline Parallelism**: Split model layers across GPUs (for very large models)
**When to use**:
- Data Parallel: Model fits on single GPU, want faster evaluation
- Tensor/Pipeline Parallel: Model too large for single GPU
## HuggingFace Models (`hf`)
### Data Parallelism (Recommended)
Each GPU loads a full copy of the model and processes a subset of evaluation data.
**Single Node (8 GPUs)**:
```bash
accelerate launch --multi_gpu --num_processes 8 \
-m lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf,dtype=bfloat16 \
--tasks mmlu,gsm8k,hellaswag \
--batch_size 16
```
**Speedup**: Near-linear (8 GPUs = ~8× faster)
**Memory**: Each GPU needs full model (7B model ≈ 14GB × 8 = 112GB total)
### Tensor Parallelism (Model Sharding)
Split model weights across GPUs for models too large for single GPU.
**Without accelerate launcher**:
```bash
lm_eval --model hf \
--model_args \
pretrained=meta-llama/Llama-2-70b-hf,\
parallelize=True,\
dtype=bfloat16 \
--tasks mmlu,gsm8k \
--batch_size 8
```
**With 8 GPUs**: 70B model (140GB) / 8 = 17.5GB per GPU ✅
**Advanced sharding**:
```bash
lm_eval --model hf \
--model_args \
pretrained=meta-llama/Llama-2-70b-hf,\
parallelize=True,\
device_map_option=auto,\
max_memory_per_gpu=40GB,\
max_cpu_memory=100GB,\
dtype=bfloat16 \
--tasks mmlu
```
**Options**:
- `device_map_option`: `"auto"` (default), `"balanced"`, `"balanced_low_0"`
- `max_memory_per_gpu`: Max memory per GPU (e.g., `"40GB"`)
- `max_cpu_memory`: Max CPU memory for offloading
- `offload_folder`: Disk offloading directory
### Combined Data + Tensor Parallelism
Use both for very large models.
**Example: 70B model on 16 GPUs (2 copies, 8 GPUs each)**:
```bash
accelerate launch --multi_gpu --num_processes 2 \
-m lm_eval --model hf \
--model_args \
pretrained=meta-llama/Llama-2-70b-hf,\
parallelize=True,\
dtype=bfloat16 \
--tasks mmlu \
--batch_size 8
```
**Result**: 2× speedup from data parallelism, 70B model fits via tensor parallelism
### Configuration with `accelerate config`
Create `~/.cache/huggingface/accelerate/default_config.yaml`:
```yaml
compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
num_machines: 1
num_processes: 8
gpu_ids: all
mixed_precision: bf16
```
**Then run**:
```bash
accelerate launch -m lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks mmlu
```
## vLLM Models (`vllm`)
vLLM provides highly optimized distributed inference.
### Tensor Parallelism
**Single Node (4 GPUs)**:
```bash
lm_eval --model vllm \
--model_args \
pretrained=meta-llama/Llama-2-70b-hf,\
tensor_parallel_size=4,\
dtype=auto,\
gpu_memory_utilization=0.9 \
--tasks mmlu,gsm8k \
--batch_size auto
```
**Memory**: 70B model split across 4 GPUs = ~35GB per GPU
### Data Parallelism
**Multiple model replicas**:
```bash
lm_eval --model vllm \
--model_args \
pretrained=meta-llama/Llama-2-7b-hf,\
data_parallel_size=4,\
dtype=auto,\
gpu_memory_utilization=0.8 \
--tasks hellaswag,arc_challenge \
--batch_size auto
```
**Result**: 4 model replicas = 4× throughput
### Combined Tensor + Data Parallelism
**Example: 8 GPUs = 4 TP × 2 DP**:
```bash
lm_eval --model vllm \
--model_args \
pretrained=meta-llama/Llama-2-70b-hf,\
tensor_parallel_size=4,\
data_parallel_size=2,\
dtype=auto,\
gpu_memory_utilization=0.85 \
--tasks mmlu \
--batch_size auto
```
**Result**: 70B model fits (TP=4), 2× speedup (DP=2)
### Multi-Node vLLM
vLLM doesn't natively support multi-node. Use Ray:
```bash
# Start Ray cluster
ray start --head --port=6379
# Run evaluation
lm_eval --model vllm \
--model_args \
pretrained=meta-llama/Llama-2-70b-hf,\
tensor_parallel_size=8,\
dtype=auto \
--tasks mmlu
```
## NVIDIA NeMo Models (`nemo_lm`)
### Data Replication
**8 replicas on 8 GPUs**:
```bash
torchrun --nproc-per-node=8 --no-python \
lm_eval --model nemo_lm \
--model_args \
path=/path/to/model.nemo,\
devices=8 \
--tasks hellaswag,arc_challenge \
--batch_size 32
```
**Speedup**: Near-linear (8× faster)
### Tensor Parallelism
**4-way tensor parallelism**:
```bash
torchrun --nproc-per-node=4 --no-python \
lm_eval --model nemo_lm \
--model_args \
path=/path/to/70b_model.nemo,\
devices=4,\
tensor_model_parallel_size=4 \
--tasks mmlu,gsm8k \
--batch_size 16
```
### Pipeline Parallelism
**2 TP × 2 PP on 4 GPUs**:
```bash
torchrun --nproc-per-node=4 --no-python \
lm_eval --model nemo_lm \
--model_args \
path=/path/to/model.nemo,\
devices=4,\
tensor_model_parallel_size=2,\
pipeline_model_parallel_size=2 \
--tasks mmlu \
--batch_size 8
```
**Constraint**: `devices = TP × PP`
### Multi-Node NeMo
Currently not supported by lm-evaluation-harness.
## SGLang Models (`sglang`)
### Tensor Parallelism
```bash
lm_eval --model sglang \
--model_args \
pretrained=meta-llama/Llama-2-70b-hf,\
tp_size=4,\
dtype=auto \
--tasks gsm8k \
--batch_size auto
```
### Data Parallelism (Deprecated)
**Note**: SGLang is deprecating data parallelism. Use tensor parallelism instead.
```bash
lm_eval --model sglang \
--model_args \
pretrained=meta-llama/Llama-2-7b-hf,\
dp_size=4,\
dtype=auto \
--tasks mmlu
```
## Performance Comparison
### 70B Model Evaluation (MMLU, 5-shot)
| Method | GPUs | Time | Memory/GPU | Notes |
|--------|------|------|------------|-------|
| HF (no parallel) | 1 | 8 hours | 140GB (OOM) | Won't fit |
| HF (TP=8) | 8 | 2 hours | 17.5GB | Slower, fits |
| HF (DP=8) | 8 | 1 hour | 140GB (OOM) | Won't fit |
| vLLM (TP=4) | 4 | 30 min | 35GB | Fast! |
| vLLM (TP=4, DP=2) | 8 | 15 min | 35GB | Fastest |
### 7B Model Evaluation (Multiple Tasks)
| Method | GPUs | Time | Speedup |
|--------|------|------|---------|
| HF (single) | 1 | 4 hours | 1× |
| HF (DP=4) | 4 | 1 hour | 4× |
| HF (DP=8) | 8 | 30 min | 8× |
| vLLM (DP=8) | 8 | 15 min | 16× |
**Takeaway**: vLLM is significantly faster than HuggingFace for inference.
## Choosing Parallelism Strategy
### Decision Tree
```
Model fits on single GPU?
├─ YES: Use data parallelism
│ ├─ HF: accelerate launch --multi_gpu --num_processes N
│ └─ vLLM: data_parallel_size=N (fastest)
└─ NO: Use tensor/pipeline parallelism
├─ Model < 70B:
│ └─ vLLM: tensor_parallel_size=4
├─ Model 70-175B:
│ ├─ vLLM: tensor_parallel_size=8
│ └─ Or HF: parallelize=True
└─ Model > 175B:
└─ Contact framework authors
```
### Memory Estimation
**Rule of thumb**:
```
Memory (GB) = Parameters (B) × Precision (bytes) × 1.2 (overhead)
```
**Examples**:
- 7B FP16: 7 × 2 × 1.2 = 16.8GB ✅ Fits A100 40GB
- 13B FP16: 13 × 2 × 1.2 = 31.2GB ✅ Fits A100 40GB
- 70B FP16: 70 × 2 × 1.2 = 168GB ❌ Need TP=4 or TP=8
- 70B BF16: 70 × 2 × 1.2 = 168GB (same as FP16)
**With tensor parallelism**:
```
Memory per GPU = Total Memory / TP
```
- 70B on 4 GPUs: 168GB / 4 = 42GB per GPU ✅
- 70B on 8 GPUs: 168GB / 8 = 21GB per GPU ✅
## Multi-Node Evaluation
### HuggingFace with SLURM
**Submit job**:
```bash
#!/bin/bash
#SBATCH --nodes=4
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=1
srun accelerate launch --multi_gpu \
--num_processes $((SLURM_NNODES * 8)) \
-m lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks mmlu,gsm8k,hellaswag \
--batch_size 16
```
**Submit**:
```bash
sbatch eval_job.sh
```
### Manual Multi-Node Setup
**On each node, run**:
```bash
accelerate launch \
--multi_gpu \
--num_machines 4 \
--num_processes 32 \
--main_process_ip $MASTER_IP \
--main_process_port 29500 \
--machine_rank $NODE_RANK \
-m lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks mmlu
```
**Environment variables**:
- `MASTER_IP`: IP of rank 0 node
- `NODE_RANK`: 0, 1, 2, 3 for each node
## Best Practices
### 1. Start Small
Test on small sample first:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-70b-hf,parallelize=True \
--tasks mmlu \
--limit 100 # Just 100 samples
```
### 2. Monitor GPU Usage
```bash
# Terminal 1: Run evaluation
lm_eval --model hf ...
# Terminal 2: Monitor
watch -n 1 nvidia-smi
```
Look for:
- GPU utilization > 90%
- Memory usage stable
- All GPUs active
### 3. Optimize Batch Size
```bash
# Auto batch size (recommended)
--batch_size auto
# Or tune manually
--batch_size 16 # Start here
--batch_size 32 # Increase if memory allows
```
### 4. Use Mixed Precision
```bash
--model_args dtype=bfloat16 # Faster, less memory
```
### 5. Check Communication
For data parallelism, check network bandwidth:
```bash
# Should see InfiniBand or high-speed network
nvidia-smi topo -m
```
## Troubleshooting
### "CUDA out of memory"
**Solutions**:
1. Increase tensor parallelism:
```bash
--model_args tensor_parallel_size=8 # Was 4
```
2. Reduce batch size:
```bash
--batch_size 4 # Was 16
```
3. Lower precision:
```bash
--model_args dtype=int8 # Quantization
```
### "NCCL error" or Hanging
**Check**:
1. All GPUs visible: `nvidia-smi`
2. NCCL installed: `python -c "import torch; print(torch.cuda.nccl.version())"`
3. Network connectivity between nodes
**Fix**:
```bash
export NCCL_DEBUG=INFO # Enable debug logging
export NCCL_IB_DISABLE=0 # Use InfiniBand if available
```
### Slow Evaluation
**Possible causes**:
1. **Data loading bottleneck**: Preprocess dataset
2. **Low GPU utilization**: Increase batch size
3. **Communication overhead**: Reduce parallelism degree
**Profile**:
```bash
lm_eval --model hf \
--model_args pretrained=meta-llama/Llama-2-7b-hf \
--tasks mmlu \
--limit 100 \
--log_samples # Check timing
```
### GPUs Imbalanced
**Symptom**: GPU 0 at 100%, others at 50%
**Solution**: Use `device_map_option=balanced`:
```bash
--model_args parallelize=True,device_map_option=balanced
```
## Example Configurations
### Small Model (7B) - Fast Evaluation
```bash
# 8 A100s, data parallel
accelerate launch --multi_gpu --num_processes 8 \
-m lm_eval --model hf \
--model_args \
pretrained=meta-llama/Llama-2-7b-hf,\
dtype=bfloat16 \
--tasks mmlu,gsm8k,hellaswag,arc_challenge \
--num_fewshot 5 \
--batch_size 32
# Time: ~30 minutes
```
### Large Model (70B) - vLLM
```bash
# 8 H100s, tensor parallel
lm_eval --model vllm \
--model_args \
pretrained=meta-llama/Llama-2-70b-hf,\
tensor_parallel_size=8,\
dtype=auto,\
gpu_memory_utilization=0.9 \
--tasks mmlu,gsm8k,humaneval \
--num_fewshot 5 \
--batch_size auto
# Time: ~1 hour
```
### Very Large Model (175B+)
**Requires specialized setup - contact framework maintainers**
## References
- HuggingFace Accelerate: https://huggingface.co/docs/accelerate/
- vLLM docs: https://docs.vllm.ai/
- NeMo docs: https://docs.nvidia.com/nemo-framework/
- lm-eval distributed guide: `docs/model_guide.md`
@@ -1,6 +1,6 @@
---
name: huggingface-hub
description: "HuggingFace hf CLI: search/download/upload models, datasets."
description: Hugging Face Hub CLI (hf) — search, download, and upload models and datasets, manage repos, query datasets with SQL, deploy inference endpoints, manage Spaces and buckets.
version: 1.0.0
author: Hugging Face
license: MIT
@@ -1,6 +1,6 @@
---
name: outlines
description: "Outlines: structured JSON/regex/Pydantic LLM generation."
description: Guarantee valid JSON/XML/code structure during generation, use Pydantic models for type-safe outputs, support local models (Transformers, vLLM), and maximize inference speed with Outlines - dottxt.ai's structured generation library
version: 1.0.0
author: Orchestra Research
license: MIT
@@ -1,6 +1,6 @@
---
name: segment-anything-model
description: "SAM: zero-shot image segmentation via points, boxes, masks."
description: Foundation model for image segmentation with zero-shot transfer. Use when you need to segment any object in images using points, boxes, or masks as prompts, or automatically generate all object masks in an image.
version: 1.0.0
author: Orchestra Research
license: MIT
@@ -1,6 +1,6 @@
---
name: fine-tuning-with-trl
description: "TRL: SFT, DPO, PPO, GRPO, reward modeling for LLM RLHF."
description: Fine-tune LLMs using reinforcement learning with TRL - SFT for instruction tuning, DPO for preference alignment, PPO/GRPO for reward optimization, and reward model training. Use when need RLHF, align model with preferences, or train from human feedback. Works with HuggingFace Transformers.
version: 1.0.0
author: Orchestra Research
license: MIT
@@ -1,6 +1,6 @@
---
name: unsloth
description: "Unsloth: 2-5x faster LoRA/QLoRA fine-tuning, less VRAM."
description: Expert guidance for fast fine-tuning with Unsloth - 2-5x faster training, 50-80% less memory, LoRA/QLoRA optimization
version: 1.0.0
author: Orchestra Research
license: MIT
@@ -1,6 +1,6 @@
---
name: linear
description: "Linear: manage issues, projects, teams via GraphQL + curl."
description: Manage Linear issues, projects, and teams via the GraphQL API. Create, update, search, and organize issues. Uses API key auth (no OAuth needed). All operations via curl — no dependencies.
version: 1.0.0
author: Hermes Agent
license: MIT
@@ -1,6 +1,11 @@
---
name: maps
description: "Geocode, POIs, routes, timezones via OpenStreetMap/OSRM."
description: >
Location intelligence — geocode a place, reverse-geocode coordinates,
find nearby places (46 POI categories), driving/walking/cycling
distance + time, turn-by-turn directions, timezone lookup, bounding
box + area for a named place, and POI search within a rectangle.
Uses OpenStreetMap + Overpass + OSRM. Free, no API key.
version: 1.2.0
author: Mibayy
license: MIT
@@ -926,18 +926,13 @@ def cmd_timezone(args):
os_ = offset_info.get("seconds", 0)
sign = "+" if oh >= 0 else "-"
utc_offset = f"{sign}{abs(oh):02d}:{om:02d}"
if os_:
utc_offset = f"{utc_offset}:{os_:02d}"
elif tz_data.get("standardUtcOffset"):
offset_info2 = tz_data["standardUtcOffset"]
if isinstance(offset_info2, dict):
oh = offset_info2.get("hours", 0)
om = abs(offset_info2.get("minutes", 0))
os_ = offset_info2.get("seconds", 0)
sign = "+" if oh >= 0 else "-"
utc_offset = f"{sign}{abs(oh):02d}:{om:02d}"
if os_:
utc_offset = f"{utc_offset}:{os_:02d}"
timezone_src = "timeapi.io"
except (RuntimeError, KeyError, TypeError):
pass # API may be down; continue to fallback
@@ -1,6 +1,6 @@
---
name: nano-pdf
description: "Edit PDF text/typos/titles via nano-pdf CLI (NL prompts)."
description: Edit PDFs with natural-language instructions using the nano-pdf CLI. Modify text, fix typos, update titles, and make content changes to specific pages without manual editing.
version: 1.0.0
author: community
license: MIT
@@ -1,6 +1,6 @@
---
name: notion
description: "Notion API via curl: pages, databases, blocks, search."
description: Notion API for creating and managing pages, databases, and blocks via curl. Search, create, update, and query Notion workspaces directly from the terminal.
version: 1.0.0
author: community
license: MIT
@@ -1,15 +1,11 @@
---
name: powerpoint
description: "Create, read, edit .pptx decks, slides, notes, templates."
description: "Use this skill any time a .pptx file is involved in any way — as input, output, or both. This includes: creating slide decks, pitch decks, or presentations; reading, parsing, or extracting text from any .pptx file (even if the extracted content will be used elsewhere, like in an email or summary); editing, modifying, or updating existing presentations; combining or splitting slide files; working with templates, layouts, speaker notes, or comments. Trigger whenever the user mentions \"deck,\" \"slides,\" \"presentation,\" or references a .pptx filename, regardless of what they plan to do with the content afterward. If a .pptx file needs to be opened, created, or touched, use this skill."
license: Proprietary. LICENSE.txt has complete terms
---
# Powerpoint Skill
## When to use
Use this skill any time a .pptx file is involved in any way — as input, output, or both. This includes: creating slide decks, pitch decks, or presentations; reading, parsing, or extracting text from any .pptx file (even if the extracted content will be used elsewhere, like in an email or summary); editing, modifying, or updating existing presentations; combining or splitting slide files; working with templates, layouts, speaker notes, or comments. Trigger whenever the user mentions "deck," "slides," "presentation," or references a .pptx filename, regardless of what they plan to do with the content afterward. If a .pptx file needs to be opened, created, or touched, use this skill.
## Quick Reference
| Task | Guide |
@@ -1,6 +1,6 @@
---
name: plan
description: "Plan mode: write markdown plan to .hermes/plans/, no exec."
description: Plan mode for Hermes — inspect context, write a markdown plan into the active workspace's `.hermes/plans/` directory, and do not execute the work.
version: 1.0.0
author: Hermes Agent
license: MIT
@@ -1,6 +1,9 @@
---
name: requesting-code-review
description: "Pre-commit review: security scan, quality gates, auto-fix."
description: >
Pre-commit verification pipeline — static security scan, baseline-aware
quality gates, independent reviewer subagent, and auto-fix loop. Use after
code changes and before committing, pushing, or opening a PR.
version: 2.0.0
author: Hermes Agent (adapted from obra/superpowers + MorAlekss)
license: MIT
@@ -1,6 +1,6 @@
---
name: subagent-driven-development
description: "Execute plans via delegate_task subagents (2-stage review)."
description: Use when executing implementation plans with independent tasks. Dispatches fresh delegate_task per task with two-stage review (spec compliance then code quality).
version: 1.1.0
author: Hermes Agent (adapted from obra/superpowers)
license: MIT
@@ -340,12 +340,3 @@ Catch issues early
```
**Quality is not an accident. It's the result of systematic process.**
## Further reading (load when relevant)
When the orchestration involves significant context usage, long review loops, or complex validation checkpoints, load these references for the specific discipline:
- **`references/context-budget-discipline.md`** — Four-tier context degradation model (PEAK / GOOD / DEGRADING / POOR), read-depth rules that scale with context window size, and early warning signs of silent degradation. Load when a run will clearly consume significant context (multi-phase plans, many subagents, large artifacts).
- **`references/gates-taxonomy.md`** — The four canonical gate types (Pre-flight, Revision, Escalation, Abort) with behavior, recovery, and examples. Load when designing or reviewing any workflow that has validation checkpoints — use the vocabulary explicitly so each gate has defined entry, failure behavior, and resumption rules.
Both references adapted from gsd-build/get-shit-done (MIT © 2025 Lex Christopherson).
@@ -1,53 +0,0 @@
# Context Budget Discipline
Practical rules for keeping orchestrator context lean when spawning subagents or reading large artifacts. Use these whenever you're running a multi-step agent loop that will consume significant context — plan execution, subagent orchestration, review pipelines, multi-file refactors.
Adapted from the GSD (Get Shit Done) project's context-budget reference — MIT © 2025 Lex Christopherson ([gsd-build/get-shit-done](https://github.com/gsd-build/get-shit-done)).
## Universal rules
Every workflow that spawns agents or reads significant content must follow these:
1. **Never read agent definition files.** `delegate_task` auto-loads them — you reading them too just doubles the cost.
2. **Never inline large files into subagent prompts.** Tell the agent to read the file from disk with `read_file` instead. The subagent gets full content; your context stays lean.
3. **Read depth scales with context window.** See the table below.
4. **Delegate heavy work to subagents.** The orchestrator routes; it doesn't execute.
5. **Proactively warn** the user when you've consumed significant context ("Context is getting heavy — consider checkpointing progress before we continue").
## Read depth by context window
Check the model's actual context window (not "it's Claude so 200K"). Some Sonnet deployments are 1M, some are 200K. If you don't know, assume the smaller one — err toward leanness.
| Context window | Subagent output reading | Summary files | Verification files | Plans for other phases |
|----------------|-------------------------|---------------|--------------------|-----------------------|
| < 500k (e.g. 200k) | Frontmatter only | Frontmatter only | Frontmatter only | Current phase only |
| >= 500k (1M models) | Full body permitted | Full body permitted | Full body permitted | Current phase only |
"Frontmatter only" means: read enough to see the final status/verdict/conclusion. If the subagent wrote a 3000-line debug log, read the summary section it produced, not the log.
## Four-tier degradation model
Monitor your context usage and shift behavior as you climb the tiers. The point is to notice *before* you hit the wall, not when responses start truncating.
| Tier | Usage | Behavior |
|------|-------|----------|
| **PEAK** | 0 30% | Full operations. Read bodies, spawn multiple agents in parallel, inline results freely. |
| **GOOD** | 30 50% | Normal operations. Prefer frontmatter reads. Delegate aggressively. |
| **DEGRADING** | 50 70% | Economize. Frontmatter-only reads, minimal inlining, **warn the user** about budget. |
| **POOR** | 70%+ | Emergency mode. **Checkpoint progress immediately.** No new reads unless critical. Finish the current task and stop cleanly. |
## Early warning signs (before panic thresholds fire)
Quality degrades *gradually* before hard limits hit. Watch for these:
- **Silent partial completion.** Subagent claims done but implementation is incomplete. Self-checks catch file existence, not semantic completeness. Always verify subagent output against the plan's must-haves, not just "did a file appear?"
- **Increasing vagueness.** Agent starts using phrases like "appropriate handling" or "standard patterns" instead of specific code. This is context pressure showing up before budget warnings fire.
- **Skipped protocol steps.** Agent omits steps it would normally follow. If success criteria has 8 items and the report covers 5, suspect context pressure, not "the agent decided 5 was enough."
When these signs appear, checkpoint the work and either reset context or hand off to a fresh subagent.
## Fundamental limitation
When you orchestrate, you cannot verify semantic correctness of subagent output — only structural completeness ("did the file appear?", "does the test pass?"). Semantic verification requires either running the code yourself or delegating a review pass to another fresh subagent.
**Mitigation:** in every task you delegate, include explicit "must-have" truths the subagent must confirm in its response (e.g., "confirm your test actually tests X, not just that X was imported"). The subagent re-asserting concrete facts is evidence; vague summaries are not.
@@ -1,93 +0,0 @@
# Gates Taxonomy
Canonical gate types for validation checkpoints across any workflow that spawns subagents, runs review loops, or has human-approval pauses. Every validation checkpoint maps to one of these four types — naming them explicitly makes the workflow legible and prevents "what happens when this check fails?" confusion.
Adapted from the GSD (Get Shit Done) project's gates reference — MIT © 2025 Lex Christopherson ([gsd-build/get-shit-done](https://github.com/gsd-build/get-shit-done)).
## The four gate types
### 1. Pre-flight gate
**Purpose:** Validates preconditions before starting an operation.
**Behavior:** Blocks entry if conditions unmet. No partial work created — bail before anything changes.
**Recovery:** Fix the missing precondition, then retry.
**Examples:**
- Implementation phase checks that the plan file exists before it starts writing code.
- Delegated subagent checks that required env vars are set before making API calls.
- Commit checks that tests passed before pushing.
### 2. Revision gate
**Purpose:** Evaluates output quality and routes to revision if insufficient.
**Behavior:** Loops back to the producer with specific feedback. Bounded by an iteration cap (typically 3).
**Recovery:** Producer addresses feedback; checker re-evaluates. The loop escalates early if issue count does not decrease between consecutive iterations (stall detection). After max iterations, escalates to the user unconditionally — never loop forever.
**Examples:**
- Plan reviewer reads a draft plan, returns specific issues, planner revises, reviewer re-reads (max 3 cycles).
- Code reviewer checks subagent-produced code against must-haves; dispatches fixes back to the implementer if any must-have failed.
- Test coverage checker validates new tests exercise the new paths; if not, sends back to author.
### 3. Escalation gate
**Purpose:** Surfaces unresolvable issues to the human for a decision.
**Behavior:** Pauses workflow, presents options, waits for human input. Never guesses, never picks a default.
**Recovery:** Human chooses action; workflow resumes on the selected path.
**Examples:**
- Revision loop exhausted after 3 iterations.
- Merge conflict during automated worktree cleanup.
- Ambiguous requirement — two reasonable interpretations and the choice changes the approach.
- Subagent reports "the plan says X but the codebase actually does Y" — human decides which is right.
### 4. Abort gate
**Purpose:** Terminates the operation to prevent damage or waste.
**Behavior:** Stops immediately, preserves state (checkpoint current progress), reports the specific reason.
**Recovery:** Human investigates root cause, fixes, restarts from checkpoint.
**Examples:**
- Context window critically low during execution (POOR tier, >70%) — abort cleanly rather than produce truncated output.
- Critical dependency unavailable mid-run (network down, API key revoked).
- Unrecoverable filesystem state (disk full, permissions lost).
- Safety invariant violated (agent attempted an irreversible destructive action outside approved scope).
## How to use this in a skill
When you write an orchestration skill that has validation checkpoints, **name each checkpoint by its gate type explicitly** and answer three questions:
1. **What condition triggers this gate?** (e.g., "plan file missing", "issue count didn't decrease", "context >70%")
2. **What happens when it fails?** (block / loop back / ask human / abort)
3. **Who resumes, and from where?** (fix precondition + retry, revise + re-check, human decision, restart from checkpoint)
Answering these three up front means your skill never hits "what do we do now?" at runtime.
## Example — a review loop with all four gate types
```
[Pre-flight] plan.md exists and is non-empty? → no: bail, ask user to write a plan first
↓ yes
[Execute] subagent implements task
[Revision] reviewer checks against must-haves → fail: loop back to subagent (max 3)
↓ pass
[Pre-flight] tests pass? → no: bail, report failing tests
↓ yes
[Commit]
(on revision loop exhaustion)
[Escalation] "3 review cycles failed to converge on issue X — pick: force-merge, rewrite task, abandon"
↓ user picks
(on any tier-POOR context pressure during loop)
[Abort] "context at 73%, checkpointing and stopping"
```
The vocabulary is small on purpose. Every gate in every workflow should fit one of these four. If you find yourself inventing a fifth, it's probably a revision gate with extra branching, or an escalation gate in disguise.
@@ -1,6 +1,6 @@
---
name: systematic-debugging
description: "4-phase root cause debugging: understand bugs before fixing."
description: Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first.
version: 1.1.0
author: Hermes Agent (adapted from obra/superpowers)
license: MIT
@@ -1,6 +1,6 @@
---
name: test-driven-development
description: "TDD: enforce RED-GREEN-REFACTOR, tests before code."
description: Use when implementing any feature or bugfix, before writing implementation code. Enforces RED-GREEN-REFACTOR cycle with test-first approach.
version: 1.1.0
author: Hermes Agent (adapted from obra/superpowers)
license: MIT
@@ -1,6 +1,6 @@
---
name: writing-plans
description: "Write implementation plans: bite-sized tasks, paths, code."
description: Use when you have a spec or requirements for a multi-step task. Creates comprehensive implementation plans with bite-sized tasks, exact file paths, and complete code examples.
version: 1.1.0
author: Hermes Agent (adapted from obra/superpowers)
license: MIT
@@ -1 +0,0 @@
ref: refs/heads/current
@@ -1,9 +0,0 @@
[core]
repositoryformatversion = 0
filemode = true
bare = true
ignorecase = true
autocrlf = false
[user]
name = Agent Zero Time Travel
email = time-travel@agent-zero.local
@@ -1 +0,0 @@
Unnamed repository; edit this file 'description' to name the repository.
@@ -1,15 +0,0 @@
#!/bin/sh
#
# An example hook script to check the commit log message taken by
# applypatch from an e-mail message.
#
# The hook should exit with non-zero status after issuing an
# appropriate message if it wants to stop the commit. The hook is
# allowed to edit the commit message file.
#
# To enable this hook, rename this file to "applypatch-msg".
. git-sh-setup
commitmsg="$(git rev-parse --git-path hooks/commit-msg)"
test -x "$commitmsg" && exec "$commitmsg" ${1+"$@"}
:
@@ -1,24 +0,0 @@
#!/bin/sh
#
# An example hook script to check the commit log message.
# Called by "git commit" with one argument, the name of the file
# that has the commit message. The hook should exit with non-zero
# status after issuing an appropriate message if it wants to stop the
# commit. The hook is allowed to edit the commit message file.
#
# To enable this hook, rename this file to "commit-msg".
# Uncomment the below to add a Signed-off-by line to the message.
# Doing this in a hook is a bad idea in general, but the prepare-commit-msg
# hook is more suited to it.
#
# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1"
# This example catches duplicate Signed-off-by lines.
test "" = "$(grep '^Signed-off-by: ' "$1" |
sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || {
echo >&2 Duplicate Signed-off-by lines.
exit 1
}
@@ -1,174 +0,0 @@
#!/usr/bin/perl
use strict;
use warnings;
use IPC::Open2;
# An example hook script to integrate Watchman
# (https://facebook.github.io/watchman/) with git to speed up detecting
# new and modified files.
#
# The hook is passed a version (currently 2) and last update token
# formatted as a string and outputs to stdout a new update token and
# all files that have been modified since the update token. Paths must
# be relative to the root of the working tree and separated by a single NUL.
#
# To enable this hook, rename this file to "query-watchman" and set
# 'git config core.fsmonitor .git/hooks/query-watchman'
#
my ($version, $last_update_token) = @ARGV;
# Uncomment for debugging
# print STDERR "$0 $version $last_update_token\n";
# Check the hook interface version
if ($version ne 2) {
die "Unsupported query-fsmonitor hook version '$version'.\n" .
"Falling back to scanning...\n";
}
my $git_work_tree = get_working_dir();
my $retry = 1;
my $json_pkg;
eval {
require JSON::XS;
$json_pkg = "JSON::XS";
1;
} or do {
require JSON::PP;
$json_pkg = "JSON::PP";
};
launch_watchman();
sub launch_watchman {
my $o = watchman_query();
if (is_work_tree_watched($o)) {
output_result($o->{clock}, @{$o->{files}});
}
}
sub output_result {
my ($clockid, @files) = @_;
# Uncomment for debugging watchman output
# open (my $fh, ">", ".git/watchman-output.out");
# binmode $fh, ":utf8";
# print $fh "$clockid\n@files\n";
# close $fh;
binmode STDOUT, ":utf8";
print $clockid;
print "\0";
local $, = "\0";
print @files;
}
sub watchman_clock {
my $response = qx/watchman clock "$git_work_tree"/;
die "Failed to get clock id on '$git_work_tree'.\n" .
"Falling back to scanning...\n" if $? != 0;
return $json_pkg->new->utf8->decode($response);
}
sub watchman_query {
my $pid = open2(\*CHLD_OUT, \*CHLD_IN, 'watchman -j --no-pretty')
or die "open2() failed: $!\n" .
"Falling back to scanning...\n";
# In the query expression below we're asking for names of files that
# changed since $last_update_token but not from the .git folder.
#
# To accomplish this, we're using the "since" generator to use the
# recency index to select candidate nodes and "fields" to limit the
# output to file names only. Then we're using the "expression" term to
# further constrain the results.
my $last_update_line = "";
if (substr($last_update_token, 0, 1) eq "c") {
$last_update_token = "\"$last_update_token\"";
$last_update_line = qq[\n"since": $last_update_token,];
}
my $query = <<" END";
["query", "$git_work_tree", {$last_update_line
"fields": ["name"],
"expression": ["not", ["dirname", ".git"]]
}]
END
# Uncomment for debugging the watchman query
# open (my $fh, ">", ".git/watchman-query.json");
# print $fh $query;
# close $fh;
print CHLD_IN $query;
close CHLD_IN;
my $response = do {local $/; <CHLD_OUT>};
# Uncomment for debugging the watch response
# open ($fh, ">", ".git/watchman-response.json");
# print $fh $response;
# close $fh;
die "Watchman: command returned no output.\n" .
"Falling back to scanning...\n" if $response eq "";
die "Watchman: command returned invalid output: $response\n" .
"Falling back to scanning...\n" unless $response =~ /^\{/;
return $json_pkg->new->utf8->decode($response);
}
sub is_work_tree_watched {
my ($output) = @_;
my $error = $output->{error};
if ($retry > 0 and $error and $error =~ m/unable to resolve root .* directory (.*) is not watched/) {
$retry--;
my $response = qx/watchman watch "$git_work_tree"/;
die "Failed to make watchman watch '$git_work_tree'.\n" .
"Falling back to scanning...\n" if $? != 0;
$output = $json_pkg->new->utf8->decode($response);
$error = $output->{error};
die "Watchman: $error.\n" .
"Falling back to scanning...\n" if $error;
# Uncomment for debugging watchman output
# open (my $fh, ">", ".git/watchman-output.out");
# close $fh;
# Watchman will always return all files on the first query so
# return the fast "everything is dirty" flag to git and do the
# Watchman query just to get it over with now so we won't pay
# the cost in git to look up each individual file.
my $o = watchman_clock();
$error = $output->{error};
die "Watchman: $error.\n" .
"Falling back to scanning...\n" if $error;
output_result($o->{clock}, ("/"));
$last_update_token = $o->{clock};
eval { launch_watchman() };
return 0;
}
die "Watchman: $error.\n" .
"Falling back to scanning...\n" if $error;
return 1;
}
sub get_working_dir {
my $working_dir;
if ($^O =~ 'msys' || $^O =~ 'cygwin') {
$working_dir = Win32::GetCwd();
$working_dir =~ tr/\\/\//;
} else {
require Cwd;
$working_dir = Cwd::cwd();
}
return $working_dir;
}
@@ -1,8 +0,0 @@
#!/bin/sh
#
# An example hook script to prepare a packed repository for use over
# dumb transports.
#
# To enable this hook, rename this file to "post-update".
exec git update-server-info
@@ -1,14 +0,0 @@
#!/bin/sh
#
# An example hook script to verify what is about to be committed
# by applypatch from an e-mail message.
#
# The hook should exit with non-zero status after issuing an
# appropriate message if it wants to stop the commit.
#
# To enable this hook, rename this file to "pre-applypatch".
. git-sh-setup
precommit="$(git rev-parse --git-path hooks/pre-commit)"
test -x "$precommit" && exec "$precommit" ${1+"$@"}
:
@@ -1,49 +0,0 @@
#!/bin/sh
#
# An example hook script to verify what is about to be committed.
# Called by "git commit" with no arguments. The hook should
# exit with non-zero status after issuing an appropriate message if
# it wants to stop the commit.
#
# To enable this hook, rename this file to "pre-commit".
if git rev-parse --verify HEAD >/dev/null 2>&1
then
against=HEAD
else
# Initial commit: diff against an empty tree object
against=$(git hash-object -t tree /dev/null)
fi
# If you want to allow non-ASCII filenames set this variable to true.
allownonascii=$(git config --type=bool hooks.allownonascii)
# Redirect output to stderr.
exec 1>&2
# Cross platform projects tend to avoid non-ASCII filenames; prevent
# them from being added to the repository. We exploit the fact that the
# printable range starts at the space character and ends with tilde.
if [ "$allownonascii" != "true" ] &&
# Note that the use of brackets around a tr range is ok here, (it's
# even required, for portability to Solaris 10's /usr/bin/tr), since
# the square bracket bytes happen to fall in the designated range.
test $(git diff-index --cached --name-only --diff-filter=A -z $against |
LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0
then
cat <<\EOF
Error: Attempt to add a non-ASCII file name.
This can cause problems if you want to work with people on other platforms.
To be portable it is advisable to rename the file.
If you know what you are doing you can disable this check using:
git config hooks.allownonascii true
EOF
exit 1
fi
# If there are whitespace errors, print the offending file names and fail.
exec git diff-index --check --cached $against --
@@ -1,13 +0,0 @@
#!/bin/sh
#
# An example hook script to verify what is about to be committed.
# Called by "git merge" with no arguments. The hook should
# exit with non-zero status after issuing an appropriate message to
# stderr if it wants to stop the merge commit.
#
# To enable this hook, rename this file to "pre-merge-commit".
. git-sh-setup
test -x "$GIT_DIR/hooks/pre-commit" &&
exec "$GIT_DIR/hooks/pre-commit"
:
@@ -1,53 +0,0 @@
#!/bin/sh
# An example hook script to verify what is about to be pushed. Called by "git
# push" after it has checked the remote status, but before anything has been
# pushed. If this script exits with a non-zero status nothing will be pushed.
#
# This hook is called with the following parameters:
#
# $1 -- Name of the remote to which the push is being done
# $2 -- URL to which the push is being done
#
# If pushing without using a named remote those arguments will be equal.
#
# Information about the commits which are being pushed is supplied as lines to
# the standard input in the form:
#
# <local ref> <local oid> <remote ref> <remote oid>
#
# This sample shows how to prevent push of commits where the log message starts
# with "WIP" (work in progress).
remote="$1"
url="$2"
zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
while read local_ref local_oid remote_ref remote_oid
do
if test "$local_oid" = "$zero"
then
# Handle delete
:
else
if test "$remote_oid" = "$zero"
then
# New branch, examine all commits
range="$local_oid"
else
# Update to existing branch, examine new commits
range="$remote_oid..$local_oid"
fi
# Check for WIP commit
commit=$(git rev-list -n 1 --grep '^WIP' "$range")
if test -n "$commit"
then
echo >&2 "Found WIP commit in $local_ref, not pushing"
exit 1
fi
fi
done
exit 0
@@ -1,169 +0,0 @@
#!/bin/sh
#
# Copyright (c) 2006, 2008 Junio C Hamano
#
# The "pre-rebase" hook is run just before "git rebase" starts doing
# its job, and can prevent the command from running by exiting with
# non-zero status.
#
# The hook is called with the following parameters:
#
# $1 -- the upstream the series was forked from.
# $2 -- the branch being rebased (or empty when rebasing the current branch).
#
# This sample shows how to prevent topic branches that are already
# merged to 'next' branch from getting rebased, because allowing it
# would result in rebasing already published history.
publish=next
basebranch="$1"
if test "$#" = 2
then
topic="refs/heads/$2"
else
topic=`git symbolic-ref HEAD` ||
exit 0 ;# we do not interrupt rebasing detached HEAD
fi
case "$topic" in
refs/heads/??/*)
;;
*)
exit 0 ;# we do not interrupt others.
;;
esac
# Now we are dealing with a topic branch being rebased
# on top of master. Is it OK to rebase it?
# Does the topic really exist?
git show-ref -q "$topic" || {
echo >&2 "No such branch $topic"
exit 1
}
# Is topic fully merged to master?
not_in_master=`git rev-list --pretty=oneline ^master "$topic"`
if test -z "$not_in_master"
then
echo >&2 "$topic is fully merged to master; better remove it."
exit 1 ;# we could allow it, but there is no point.
fi
# Is topic ever merged to next? If so you should not be rebasing it.
only_next_1=`git rev-list ^master "^$topic" ${publish} | sort`
only_next_2=`git rev-list ^master ${publish} | sort`
if test "$only_next_1" = "$only_next_2"
then
not_in_topic=`git rev-list "^$topic" master`
if test -z "$not_in_topic"
then
echo >&2 "$topic is already up to date with master"
exit 1 ;# we could allow it, but there is no point.
else
exit 0
fi
else
not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"`
/usr/bin/perl -e '
my $topic = $ARGV[0];
my $msg = "* $topic has commits already merged to public branch:\n";
my (%not_in_next) = map {
/^([0-9a-f]+) /;
($1 => 1);
} split(/\n/, $ARGV[1]);
for my $elem (map {
/^([0-9a-f]+) (.*)$/;
[$1 => $2];
} split(/\n/, $ARGV[2])) {
if (!exists $not_in_next{$elem->[0]}) {
if ($msg) {
print STDERR $msg;
undef $msg;
}
print STDERR " $elem->[1]\n";
}
}
' "$topic" "$not_in_next" "$not_in_master"
exit 1
fi
<<\DOC_END
This sample hook safeguards topic branches that have been
published from being rewound.
The workflow assumed here is:
* Once a topic branch forks from "master", "master" is never
merged into it again (either directly or indirectly).
* Once a topic branch is fully cooked and merged into "master",
it is deleted. If you need to build on top of it to correct
earlier mistakes, a new topic branch is created by forking at
the tip of the "master". This is not strictly necessary, but
it makes it easier to keep your history simple.
* Whenever you need to test or publish your changes to topic
branches, merge them into "next" branch.
The script, being an example, hardcodes the publish branch name
to be "next", but it is trivial to make it configurable via
$GIT_DIR/config mechanism.
With this workflow, you would want to know:
(1) ... if a topic branch has ever been merged to "next". Young
topic branches can have stupid mistakes you would rather
clean up before publishing, and things that have not been
merged into other branches can be easily rebased without
affecting other people. But once it is published, you would
not want to rewind it.
(2) ... if a topic branch has been fully merged to "master".
Then you can delete it. More importantly, you should not
build on top of it -- other people may already want to
change things related to the topic as patches against your
"master", so if you need further changes, it is better to
fork the topic (perhaps with the same name) afresh from the
tip of "master".
Let's look at this example:
o---o---o---o---o---o---o---o---o---o "next"
/ / / /
/ a---a---b A / /
/ / / /
/ / c---c---c---c B /
/ / / \ /
/ / / b---b C \ /
/ / / / \ /
---o---o---o---o---o---o---o---o---o---o---o "master"
A, B and C are topic branches.
* A has one fix since it was merged up to "next".
* B has finished. It has been fully merged up to "master" and "next",
and is ready to be deleted.
* C has not merged to "next" at all.
We would want to allow C to be rebased, refuse A, and encourage
B to be deleted.
To compute (1):
git rev-list ^master ^topic next
git rev-list ^master next
if these match, topic has not merged in next at all.
To compute (2):
git rev-list master..topic
if this is empty, it is fully merged to "master".
DOC_END
@@ -1,24 +0,0 @@
#!/bin/sh
#
# An example hook script to make use of push options.
# The example simply echoes all push options that start with 'echoback='
# and rejects all pushes when the "reject" push option is used.
#
# To enable this hook, rename this file to "pre-receive".
if test -n "$GIT_PUSH_OPTION_COUNT"
then
i=0
while test "$i" -lt "$GIT_PUSH_OPTION_COUNT"
do
eval "value=\$GIT_PUSH_OPTION_$i"
case "$value" in
echoback=*)
echo "echo from the pre-receive-hook: ${value#*=}" >&2
;;
reject)
exit 1
esac
i=$((i + 1))
done
fi
@@ -1,42 +0,0 @@
#!/bin/sh
#
# An example hook script to prepare the commit log message.
# Called by "git commit" with the name of the file that has the
# commit message, followed by the description of the commit
# message's source. The hook's purpose is to edit the commit
# message file. If the hook fails with a non-zero status,
# the commit is aborted.
#
# To enable this hook, rename this file to "prepare-commit-msg".
# This hook includes three examples. The first one removes the
# "# Please enter the commit message..." help message.
#
# The second includes the output of "git diff --name-status -r"
# into the message, just before the "git status" output. It is
# commented because it doesn't cope with --amend or with squashed
# commits.
#
# The third example adds a Signed-off-by line to the message, that can
# still be edited. This is rarely a good idea.
COMMIT_MSG_FILE=$1
COMMIT_SOURCE=$2
SHA1=$3
/usr/bin/perl -i.bak -ne 'print unless(m/^. Please enter the commit message/..m/^#$/)' "$COMMIT_MSG_FILE"
# case "$COMMIT_SOURCE,$SHA1" in
# ,|template,)
# /usr/bin/perl -i.bak -pe '
# print "\n" . `git diff --cached --name-status -r`
# if /^#/ && $first++ == 0' "$COMMIT_MSG_FILE" ;;
# *) ;;
# esac
# SOB=$(git var GIT_COMMITTER_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
# git interpret-trailers --in-place --trailer "$SOB" "$COMMIT_MSG_FILE"
# if test -z "$COMMIT_SOURCE"
# then
# /usr/bin/perl -i.bak -pe 'print "\n" if !$first_line++' "$COMMIT_MSG_FILE"
# fi
@@ -1,78 +0,0 @@
#!/bin/sh
# An example hook script to update a checked-out tree on a git push.
#
# This hook is invoked by git-receive-pack(1) when it reacts to git
# push and updates reference(s) in its repository, and when the push
# tries to update the branch that is currently checked out and the
# receive.denyCurrentBranch configuration variable is set to
# updateInstead.
#
# By default, such a push is refused if the working tree and the index
# of the remote repository has any difference from the currently
# checked out commit; when both the working tree and the index match
# the current commit, they are updated to match the newly pushed tip
# of the branch. This hook is to be used to override the default
# behaviour; however the code below reimplements the default behaviour
# as a starting point for convenient modification.
#
# The hook receives the commit with which the tip of the current
# branch is going to be updated:
commit=$1
# It can exit with a non-zero status to refuse the push (when it does
# so, it must not modify the index or the working tree).
die () {
echo >&2 "$*"
exit 1
}
# Or it can make any necessary changes to the working tree and to the
# index to bring them to the desired state when the tip of the current
# branch is updated to the new commit, and exit with a zero status.
#
# For example, the hook can simply run git read-tree -u -m HEAD "$1"
# in order to emulate git fetch that is run in the reverse direction
# with git push, as the two-tree form of git read-tree -u -m is
# essentially the same as git switch or git checkout that switches
# branches while keeping the local changes in the working tree that do
# not interfere with the difference between the branches.
# The below is a more-or-less exact translation to shell of the C code
# for the default behaviour for git's push-to-checkout hook defined in
# the push_to_deploy() function in builtin/receive-pack.c.
#
# Note that the hook will be executed from the repository directory,
# not from the working tree, so if you want to perform operations on
# the working tree, you will have to adapt your code accordingly, e.g.
# by adding "cd .." or using relative paths.
if ! git update-index -q --ignore-submodules --refresh
then
die "Up-to-date check failed"
fi
if ! git diff-files --quiet --ignore-submodules --
then
die "Working directory has unstaged changes"
fi
# This is a rough translation of:
#
# head_has_history() ? "HEAD" : EMPTY_TREE_SHA1_HEX
if git cat-file -e HEAD 2>/dev/null
then
head=HEAD
else
head=$(git hash-object -t tree --stdin </dev/null)
fi
if ! git diff-index --quiet --cached --ignore-submodules $head --
then
die "Working directory has staged changes"
fi
if ! git read-tree -u -m "$commit"
then
die "Could not update working tree to new HEAD"
fi
@@ -1,77 +0,0 @@
#!/bin/sh
# An example hook script to validate a patch (and/or patch series) before
# sending it via email.
#
# The hook should exit with non-zero status after issuing an appropriate
# message if it wants to prevent the email(s) from being sent.
#
# To enable this hook, rename this file to "sendemail-validate".
#
# By default, it will only check that the patch(es) can be applied on top of
# the default upstream branch without conflicts in a secondary worktree. After
# validation (successful or not) of the last patch of a series, the worktree
# will be deleted.
#
# The following config variables can be set to change the default remote and
# remote ref that are used to apply the patches against:
#
# sendemail.validateRemote (default: origin)
# sendemail.validateRemoteRef (default: HEAD)
#
# Replace the TODO placeholders with appropriate checks according to your
# needs.
validate_cover_letter () {
file="$1"
# TODO: Replace with appropriate checks (e.g. spell checking).
true
}
validate_patch () {
file="$1"
# Ensure that the patch applies without conflicts.
git am -3 "$file" || return
# TODO: Replace with appropriate checks for this patch
# (e.g. checkpatch.pl).
true
}
validate_series () {
# TODO: Replace with appropriate checks for the whole series
# (e.g. quick build, coding style checks, etc.).
true
}
# main -------------------------------------------------------------------------
if test "$GIT_SENDEMAIL_FILE_COUNTER" = 1
then
remote=$(git config --default origin --get sendemail.validateRemote) &&
ref=$(git config --default HEAD --get sendemail.validateRemoteRef) &&
worktree=$(mktemp --tmpdir -d sendemail-validate.XXXXXXX) &&
git worktree add -fd --checkout "$worktree" "refs/remotes/$remote/$ref" &&
git config --replace-all sendemail.validateWorktree "$worktree"
else
worktree=$(git config --get sendemail.validateWorktree)
fi || {
echo "sendemail-validate: error: failed to prepare worktree" >&2
exit 1
}
unset GIT_DIR GIT_WORK_TREE
cd "$worktree" &&
if grep -q "^diff --git " "$1"
then
validate_patch "$1"
else
validate_cover_letter "$1"
fi &&
if test "$GIT_SENDEMAIL_FILE_COUNTER" = "$GIT_SENDEMAIL_FILE_TOTAL"
then
git config --unset-all sendemail.validateWorktree &&
trap 'git worktree remove -ff "$worktree"' EXIT &&
validate_series
fi
@@ -1,128 +0,0 @@
#!/bin/sh
#
# An example hook script to block unannotated tags from entering.
# Called by "git receive-pack" with arguments: refname sha1-old sha1-new
#
# To enable this hook, rename this file to "update".
#
# Config
# ------
# hooks.allowunannotated
# This boolean sets whether unannotated tags will be allowed into the
# repository. By default they won't be.
# hooks.allowdeletetag
# This boolean sets whether deleting tags will be allowed in the
# repository. By default they won't be.
# hooks.allowmodifytag
# This boolean sets whether a tag may be modified after creation. By default
# it won't be.
# hooks.allowdeletebranch
# This boolean sets whether deleting branches will be allowed in the
# repository. By default they won't be.
# hooks.denycreatebranch
# This boolean sets whether remotely creating branches will be denied
# in the repository. By default this is allowed.
#
# --- Command line
refname="$1"
oldrev="$2"
newrev="$3"
# --- Safety check
if [ -z "$GIT_DIR" ]; then
echo "Don't run this script from the command line." >&2
echo " (if you want, you could supply GIT_DIR then run" >&2
echo " $0 <ref> <oldrev> <newrev>)" >&2
exit 1
fi
if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then
echo "usage: $0 <ref> <oldrev> <newrev>" >&2
exit 1
fi
# --- Config
allowunannotated=$(git config --type=bool hooks.allowunannotated)
allowdeletebranch=$(git config --type=bool hooks.allowdeletebranch)
denycreatebranch=$(git config --type=bool hooks.denycreatebranch)
allowdeletetag=$(git config --type=bool hooks.allowdeletetag)
allowmodifytag=$(git config --type=bool hooks.allowmodifytag)
# check for no description
projectdesc=$(sed -e '1q' "$GIT_DIR/description")
case "$projectdesc" in
"Unnamed repository"* | "")
echo "*** Project description file hasn't been set" >&2
exit 1
;;
esac
# --- Check types
# if $newrev is 0000...0000, it's a commit to delete a ref.
zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
if [ "$newrev" = "$zero" ]; then
newrev_type=delete
else
newrev_type=$(git cat-file -t $newrev)
fi
case "$refname","$newrev_type" in
refs/tags/*,commit)
# un-annotated tag
short_refname=${refname##refs/tags/}
if [ "$allowunannotated" != "true" ]; then
echo "*** The un-annotated tag, $short_refname, is not allowed in this repository" >&2
echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2
exit 1
fi
;;
refs/tags/*,delete)
# delete tag
if [ "$allowdeletetag" != "true" ]; then
echo "*** Deleting a tag is not allowed in this repository" >&2
exit 1
fi
;;
refs/tags/*,tag)
# annotated tag
if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1
then
echo "*** Tag '$refname' already exists." >&2
echo "*** Modifying a tag is not allowed in this repository." >&2
exit 1
fi
;;
refs/heads/*,commit)
# branch
if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then
echo "*** Creating a branch is not allowed in this repository" >&2
exit 1
fi
;;
refs/heads/*,delete)
# delete branch
if [ "$allowdeletebranch" != "true" ]; then
echo "*** Deleting a branch is not allowed in this repository" >&2
exit 1
fi
;;
refs/remotes/*,commit)
# tracking branch
;;
refs/remotes/*,delete)
# delete tracking branch
if [ "$allowdeletebranch" != "true" ]; then
echo "*** Deleting a tracking branch is not allowed in this repository" >&2
exit 1
fi
;;
*)
# Anything else (is there anything else?)
echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2
exit 1
;;
esac
# --- Finished
exit 0
@@ -1,6 +0,0 @@
# git ls-files --others --exclude-from=.git/info/exclude
# Lines that start with '#' are comments.
# For a project mostly in C, the following would be a good set of
# exclude patterns (uncomment them if you want to use them):
# *.[oa]
# *~
@@ -1 +0,0 @@
0000000000000000000000000000000000000000 63bd477487dac6d1284d89a143c843b06ef86986 Agent Zero Time Travel <time-travel@agent-zero.local> 1777743645 +0000
@@ -1 +0,0 @@
0000000000000000000000000000000000000000 63bd477487dac6d1284d89a143c843b06ef86986 Agent Zero Time Travel <time-travel@agent-zero.local> 1777743645 +0000
@@ -1 +0,0 @@
63bd477487dac6d1284d89a143c843b06ef86986
@@ -1 +0,0 @@
ref: refs/heads/current
@@ -1,9 +0,0 @@
[core]
repositoryformatversion = 0
filemode = true
bare = true
ignorecase = true
autocrlf = false
[user]
name = Agent Zero Time Travel
email = time-travel@agent-zero.local
@@ -1 +0,0 @@
Unnamed repository; edit this file 'description' to name the repository.
@@ -1,15 +0,0 @@
#!/bin/sh
#
# An example hook script to check the commit log message taken by
# applypatch from an e-mail message.
#
# The hook should exit with non-zero status after issuing an
# appropriate message if it wants to stop the commit. The hook is
# allowed to edit the commit message file.
#
# To enable this hook, rename this file to "applypatch-msg".
. git-sh-setup
commitmsg="$(git rev-parse --git-path hooks/commit-msg)"
test -x "$commitmsg" && exec "$commitmsg" ${1+"$@"}
:
@@ -1,24 +0,0 @@
#!/bin/sh
#
# An example hook script to check the commit log message.
# Called by "git commit" with one argument, the name of the file
# that has the commit message. The hook should exit with non-zero
# status after issuing an appropriate message if it wants to stop the
# commit. The hook is allowed to edit the commit message file.
#
# To enable this hook, rename this file to "commit-msg".
# Uncomment the below to add a Signed-off-by line to the message.
# Doing this in a hook is a bad idea in general, but the prepare-commit-msg
# hook is more suited to it.
#
# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1"
# This example catches duplicate Signed-off-by lines.
test "" = "$(grep '^Signed-off-by: ' "$1" |
sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || {
echo >&2 Duplicate Signed-off-by lines.
exit 1
}
@@ -1,174 +0,0 @@
#!/usr/bin/perl
use strict;
use warnings;
use IPC::Open2;
# An example hook script to integrate Watchman
# (https://facebook.github.io/watchman/) with git to speed up detecting
# new and modified files.
#
# The hook is passed a version (currently 2) and last update token
# formatted as a string and outputs to stdout a new update token and
# all files that have been modified since the update token. Paths must
# be relative to the root of the working tree and separated by a single NUL.
#
# To enable this hook, rename this file to "query-watchman" and set
# 'git config core.fsmonitor .git/hooks/query-watchman'
#
my ($version, $last_update_token) = @ARGV;
# Uncomment for debugging
# print STDERR "$0 $version $last_update_token\n";
# Check the hook interface version
if ($version ne 2) {
die "Unsupported query-fsmonitor hook version '$version'.\n" .
"Falling back to scanning...\n";
}
my $git_work_tree = get_working_dir();
my $retry = 1;
my $json_pkg;
eval {
require JSON::XS;
$json_pkg = "JSON::XS";
1;
} or do {
require JSON::PP;
$json_pkg = "JSON::PP";
};
launch_watchman();
sub launch_watchman {
my $o = watchman_query();
if (is_work_tree_watched($o)) {
output_result($o->{clock}, @{$o->{files}});
}
}
sub output_result {
my ($clockid, @files) = @_;
# Uncomment for debugging watchman output
# open (my $fh, ">", ".git/watchman-output.out");
# binmode $fh, ":utf8";
# print $fh "$clockid\n@files\n";
# close $fh;
binmode STDOUT, ":utf8";
print $clockid;
print "\0";
local $, = "\0";
print @files;
}
sub watchman_clock {
my $response = qx/watchman clock "$git_work_tree"/;
die "Failed to get clock id on '$git_work_tree'.\n" .
"Falling back to scanning...\n" if $? != 0;
return $json_pkg->new->utf8->decode($response);
}
sub watchman_query {
my $pid = open2(\*CHLD_OUT, \*CHLD_IN, 'watchman -j --no-pretty')
or die "open2() failed: $!\n" .
"Falling back to scanning...\n";
# In the query expression below we're asking for names of files that
# changed since $last_update_token but not from the .git folder.
#
# To accomplish this, we're using the "since" generator to use the
# recency index to select candidate nodes and "fields" to limit the
# output to file names only. Then we're using the "expression" term to
# further constrain the results.
my $last_update_line = "";
if (substr($last_update_token, 0, 1) eq "c") {
$last_update_token = "\"$last_update_token\"";
$last_update_line = qq[\n"since": $last_update_token,];
}
my $query = <<" END";
["query", "$git_work_tree", {$last_update_line
"fields": ["name"],
"expression": ["not", ["dirname", ".git"]]
}]
END
# Uncomment for debugging the watchman query
# open (my $fh, ">", ".git/watchman-query.json");
# print $fh $query;
# close $fh;
print CHLD_IN $query;
close CHLD_IN;
my $response = do {local $/; <CHLD_OUT>};
# Uncomment for debugging the watch response
# open ($fh, ">", ".git/watchman-response.json");
# print $fh $response;
# close $fh;
die "Watchman: command returned no output.\n" .
"Falling back to scanning...\n" if $response eq "";
die "Watchman: command returned invalid output: $response\n" .
"Falling back to scanning...\n" unless $response =~ /^\{/;
return $json_pkg->new->utf8->decode($response);
}
sub is_work_tree_watched {
my ($output) = @_;
my $error = $output->{error};
if ($retry > 0 and $error and $error =~ m/unable to resolve root .* directory (.*) is not watched/) {
$retry--;
my $response = qx/watchman watch "$git_work_tree"/;
die "Failed to make watchman watch '$git_work_tree'.\n" .
"Falling back to scanning...\n" if $? != 0;
$output = $json_pkg->new->utf8->decode($response);
$error = $output->{error};
die "Watchman: $error.\n" .
"Falling back to scanning...\n" if $error;
# Uncomment for debugging watchman output
# open (my $fh, ">", ".git/watchman-output.out");
# close $fh;
# Watchman will always return all files on the first query so
# return the fast "everything is dirty" flag to git and do the
# Watchman query just to get it over with now so we won't pay
# the cost in git to look up each individual file.
my $o = watchman_clock();
$error = $output->{error};
die "Watchman: $error.\n" .
"Falling back to scanning...\n" if $error;
output_result($o->{clock}, ("/"));
$last_update_token = $o->{clock};
eval { launch_watchman() };
return 0;
}
die "Watchman: $error.\n" .
"Falling back to scanning...\n" if $error;
return 1;
}
sub get_working_dir {
my $working_dir;
if ($^O =~ 'msys' || $^O =~ 'cygwin') {
$working_dir = Win32::GetCwd();
$working_dir =~ tr/\\/\//;
} else {
require Cwd;
$working_dir = Cwd::cwd();
}
return $working_dir;
}
@@ -1,8 +0,0 @@
#!/bin/sh
#
# An example hook script to prepare a packed repository for use over
# dumb transports.
#
# To enable this hook, rename this file to "post-update".
exec git update-server-info
@@ -1,14 +0,0 @@
#!/bin/sh
#
# An example hook script to verify what is about to be committed
# by applypatch from an e-mail message.
#
# The hook should exit with non-zero status after issuing an
# appropriate message if it wants to stop the commit.
#
# To enable this hook, rename this file to "pre-applypatch".
. git-sh-setup
precommit="$(git rev-parse --git-path hooks/pre-commit)"
test -x "$precommit" && exec "$precommit" ${1+"$@"}
:
@@ -1,49 +0,0 @@
#!/bin/sh
#
# An example hook script to verify what is about to be committed.
# Called by "git commit" with no arguments. The hook should
# exit with non-zero status after issuing an appropriate message if
# it wants to stop the commit.
#
# To enable this hook, rename this file to "pre-commit".
if git rev-parse --verify HEAD >/dev/null 2>&1
then
against=HEAD
else
# Initial commit: diff against an empty tree object
against=$(git hash-object -t tree /dev/null)
fi
# If you want to allow non-ASCII filenames set this variable to true.
allownonascii=$(git config --type=bool hooks.allownonascii)
# Redirect output to stderr.
exec 1>&2
# Cross platform projects tend to avoid non-ASCII filenames; prevent
# them from being added to the repository. We exploit the fact that the
# printable range starts at the space character and ends with tilde.
if [ "$allownonascii" != "true" ] &&
# Note that the use of brackets around a tr range is ok here, (it's
# even required, for portability to Solaris 10's /usr/bin/tr), since
# the square bracket bytes happen to fall in the designated range.
test $(git diff-index --cached --name-only --diff-filter=A -z $against |
LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0
then
cat <<\EOF
Error: Attempt to add a non-ASCII file name.
This can cause problems if you want to work with people on other platforms.
To be portable it is advisable to rename the file.
If you know what you are doing you can disable this check using:
git config hooks.allownonascii true
EOF
exit 1
fi
# If there are whitespace errors, print the offending file names and fail.
exec git diff-index --check --cached $against --
@@ -1,13 +0,0 @@
#!/bin/sh
#
# An example hook script to verify what is about to be committed.
# Called by "git merge" with no arguments. The hook should
# exit with non-zero status after issuing an appropriate message to
# stderr if it wants to stop the merge commit.
#
# To enable this hook, rename this file to "pre-merge-commit".
. git-sh-setup
test -x "$GIT_DIR/hooks/pre-commit" &&
exec "$GIT_DIR/hooks/pre-commit"
:
@@ -1,53 +0,0 @@
#!/bin/sh
# An example hook script to verify what is about to be pushed. Called by "git
# push" after it has checked the remote status, but before anything has been
# pushed. If this script exits with a non-zero status nothing will be pushed.
#
# This hook is called with the following parameters:
#
# $1 -- Name of the remote to which the push is being done
# $2 -- URL to which the push is being done
#
# If pushing without using a named remote those arguments will be equal.
#
# Information about the commits which are being pushed is supplied as lines to
# the standard input in the form:
#
# <local ref> <local oid> <remote ref> <remote oid>
#
# This sample shows how to prevent push of commits where the log message starts
# with "WIP" (work in progress).
remote="$1"
url="$2"
zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
while read local_ref local_oid remote_ref remote_oid
do
if test "$local_oid" = "$zero"
then
# Handle delete
:
else
if test "$remote_oid" = "$zero"
then
# New branch, examine all commits
range="$local_oid"
else
# Update to existing branch, examine new commits
range="$remote_oid..$local_oid"
fi
# Check for WIP commit
commit=$(git rev-list -n 1 --grep '^WIP' "$range")
if test -n "$commit"
then
echo >&2 "Found WIP commit in $local_ref, not pushing"
exit 1
fi
fi
done
exit 0
@@ -1,169 +0,0 @@
#!/bin/sh
#
# Copyright (c) 2006, 2008 Junio C Hamano
#
# The "pre-rebase" hook is run just before "git rebase" starts doing
# its job, and can prevent the command from running by exiting with
# non-zero status.
#
# The hook is called with the following parameters:
#
# $1 -- the upstream the series was forked from.
# $2 -- the branch being rebased (or empty when rebasing the current branch).
#
# This sample shows how to prevent topic branches that are already
# merged to 'next' branch from getting rebased, because allowing it
# would result in rebasing already published history.
publish=next
basebranch="$1"
if test "$#" = 2
then
topic="refs/heads/$2"
else
topic=`git symbolic-ref HEAD` ||
exit 0 ;# we do not interrupt rebasing detached HEAD
fi
case "$topic" in
refs/heads/??/*)
;;
*)
exit 0 ;# we do not interrupt others.
;;
esac
# Now we are dealing with a topic branch being rebased
# on top of master. Is it OK to rebase it?
# Does the topic really exist?
git show-ref -q "$topic" || {
echo >&2 "No such branch $topic"
exit 1
}
# Is topic fully merged to master?
not_in_master=`git rev-list --pretty=oneline ^master "$topic"`
if test -z "$not_in_master"
then
echo >&2 "$topic is fully merged to master; better remove it."
exit 1 ;# we could allow it, but there is no point.
fi
# Is topic ever merged to next? If so you should not be rebasing it.
only_next_1=`git rev-list ^master "^$topic" ${publish} | sort`
only_next_2=`git rev-list ^master ${publish} | sort`
if test "$only_next_1" = "$only_next_2"
then
not_in_topic=`git rev-list "^$topic" master`
if test -z "$not_in_topic"
then
echo >&2 "$topic is already up to date with master"
exit 1 ;# we could allow it, but there is no point.
else
exit 0
fi
else
not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"`
/usr/bin/perl -e '
my $topic = $ARGV[0];
my $msg = "* $topic has commits already merged to public branch:\n";
my (%not_in_next) = map {
/^([0-9a-f]+) /;
($1 => 1);
} split(/\n/, $ARGV[1]);
for my $elem (map {
/^([0-9a-f]+) (.*)$/;
[$1 => $2];
} split(/\n/, $ARGV[2])) {
if (!exists $not_in_next{$elem->[0]}) {
if ($msg) {
print STDERR $msg;
undef $msg;
}
print STDERR " $elem->[1]\n";
}
}
' "$topic" "$not_in_next" "$not_in_master"
exit 1
fi
<<\DOC_END
This sample hook safeguards topic branches that have been
published from being rewound.
The workflow assumed here is:
* Once a topic branch forks from "master", "master" is never
merged into it again (either directly or indirectly).
* Once a topic branch is fully cooked and merged into "master",
it is deleted. If you need to build on top of it to correct
earlier mistakes, a new topic branch is created by forking at
the tip of the "master". This is not strictly necessary, but
it makes it easier to keep your history simple.
* Whenever you need to test or publish your changes to topic
branches, merge them into "next" branch.
The script, being an example, hardcodes the publish branch name
to be "next", but it is trivial to make it configurable via
$GIT_DIR/config mechanism.
With this workflow, you would want to know:
(1) ... if a topic branch has ever been merged to "next". Young
topic branches can have stupid mistakes you would rather
clean up before publishing, and things that have not been
merged into other branches can be easily rebased without
affecting other people. But once it is published, you would
not want to rewind it.
(2) ... if a topic branch has been fully merged to "master".
Then you can delete it. More importantly, you should not
build on top of it -- other people may already want to
change things related to the topic as patches against your
"master", so if you need further changes, it is better to
fork the topic (perhaps with the same name) afresh from the
tip of "master".
Let's look at this example:
o---o---o---o---o---o---o---o---o---o "next"
/ / / /
/ a---a---b A / /
/ / / /
/ / c---c---c---c B /
/ / / \ /
/ / / b---b C \ /
/ / / / \ /
---o---o---o---o---o---o---o---o---o---o---o "master"
A, B and C are topic branches.
* A has one fix since it was merged up to "next".
* B has finished. It has been fully merged up to "master" and "next",
and is ready to be deleted.
* C has not merged to "next" at all.
We would want to allow C to be rebased, refuse A, and encourage
B to be deleted.
To compute (1):
git rev-list ^master ^topic next
git rev-list ^master next
if these match, topic has not merged in next at all.
To compute (2):
git rev-list master..topic
if this is empty, it is fully merged to "master".
DOC_END
@@ -1,24 +0,0 @@
#!/bin/sh
#
# An example hook script to make use of push options.
# The example simply echoes all push options that start with 'echoback='
# and rejects all pushes when the "reject" push option is used.
#
# To enable this hook, rename this file to "pre-receive".
if test -n "$GIT_PUSH_OPTION_COUNT"
then
i=0
while test "$i" -lt "$GIT_PUSH_OPTION_COUNT"
do
eval "value=\$GIT_PUSH_OPTION_$i"
case "$value" in
echoback=*)
echo "echo from the pre-receive-hook: ${value#*=}" >&2
;;
reject)
exit 1
esac
i=$((i + 1))
done
fi
@@ -1,42 +0,0 @@
#!/bin/sh
#
# An example hook script to prepare the commit log message.
# Called by "git commit" with the name of the file that has the
# commit message, followed by the description of the commit
# message's source. The hook's purpose is to edit the commit
# message file. If the hook fails with a non-zero status,
# the commit is aborted.
#
# To enable this hook, rename this file to "prepare-commit-msg".
# This hook includes three examples. The first one removes the
# "# Please enter the commit message..." help message.
#
# The second includes the output of "git diff --name-status -r"
# into the message, just before the "git status" output. It is
# commented because it doesn't cope with --amend or with squashed
# commits.
#
# The third example adds a Signed-off-by line to the message, that can
# still be edited. This is rarely a good idea.
COMMIT_MSG_FILE=$1
COMMIT_SOURCE=$2
SHA1=$3
/usr/bin/perl -i.bak -ne 'print unless(m/^. Please enter the commit message/..m/^#$/)' "$COMMIT_MSG_FILE"
# case "$COMMIT_SOURCE,$SHA1" in
# ,|template,)
# /usr/bin/perl -i.bak -pe '
# print "\n" . `git diff --cached --name-status -r`
# if /^#/ && $first++ == 0' "$COMMIT_MSG_FILE" ;;
# *) ;;
# esac
# SOB=$(git var GIT_COMMITTER_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
# git interpret-trailers --in-place --trailer "$SOB" "$COMMIT_MSG_FILE"
# if test -z "$COMMIT_SOURCE"
# then
# /usr/bin/perl -i.bak -pe 'print "\n" if !$first_line++' "$COMMIT_MSG_FILE"
# fi
@@ -1,78 +0,0 @@
#!/bin/sh
# An example hook script to update a checked-out tree on a git push.
#
# This hook is invoked by git-receive-pack(1) when it reacts to git
# push and updates reference(s) in its repository, and when the push
# tries to update the branch that is currently checked out and the
# receive.denyCurrentBranch configuration variable is set to
# updateInstead.
#
# By default, such a push is refused if the working tree and the index
# of the remote repository has any difference from the currently
# checked out commit; when both the working tree and the index match
# the current commit, they are updated to match the newly pushed tip
# of the branch. This hook is to be used to override the default
# behaviour; however the code below reimplements the default behaviour
# as a starting point for convenient modification.
#
# The hook receives the commit with which the tip of the current
# branch is going to be updated:
commit=$1
# It can exit with a non-zero status to refuse the push (when it does
# so, it must not modify the index or the working tree).
die () {
echo >&2 "$*"
exit 1
}
# Or it can make any necessary changes to the working tree and to the
# index to bring them to the desired state when the tip of the current
# branch is updated to the new commit, and exit with a zero status.
#
# For example, the hook can simply run git read-tree -u -m HEAD "$1"
# in order to emulate git fetch that is run in the reverse direction
# with git push, as the two-tree form of git read-tree -u -m is
# essentially the same as git switch or git checkout that switches
# branches while keeping the local changes in the working tree that do
# not interfere with the difference between the branches.
# The below is a more-or-less exact translation to shell of the C code
# for the default behaviour for git's push-to-checkout hook defined in
# the push_to_deploy() function in builtin/receive-pack.c.
#
# Note that the hook will be executed from the repository directory,
# not from the working tree, so if you want to perform operations on
# the working tree, you will have to adapt your code accordingly, e.g.
# by adding "cd .." or using relative paths.
if ! git update-index -q --ignore-submodules --refresh
then
die "Up-to-date check failed"
fi
if ! git diff-files --quiet --ignore-submodules --
then
die "Working directory has unstaged changes"
fi
# This is a rough translation of:
#
# head_has_history() ? "HEAD" : EMPTY_TREE_SHA1_HEX
if git cat-file -e HEAD 2>/dev/null
then
head=HEAD
else
head=$(git hash-object -t tree --stdin </dev/null)
fi
if ! git diff-index --quiet --cached --ignore-submodules $head --
then
die "Working directory has staged changes"
fi
if ! git read-tree -u -m "$commit"
then
die "Could not update working tree to new HEAD"
fi
@@ -1,77 +0,0 @@
#!/bin/sh
# An example hook script to validate a patch (and/or patch series) before
# sending it via email.
#
# The hook should exit with non-zero status after issuing an appropriate
# message if it wants to prevent the email(s) from being sent.
#
# To enable this hook, rename this file to "sendemail-validate".
#
# By default, it will only check that the patch(es) can be applied on top of
# the default upstream branch without conflicts in a secondary worktree. After
# validation (successful or not) of the last patch of a series, the worktree
# will be deleted.
#
# The following config variables can be set to change the default remote and
# remote ref that are used to apply the patches against:
#
# sendemail.validateRemote (default: origin)
# sendemail.validateRemoteRef (default: HEAD)
#
# Replace the TODO placeholders with appropriate checks according to your
# needs.
validate_cover_letter () {
file="$1"
# TODO: Replace with appropriate checks (e.g. spell checking).
true
}
validate_patch () {
file="$1"
# Ensure that the patch applies without conflicts.
git am -3 "$file" || return
# TODO: Replace with appropriate checks for this patch
# (e.g. checkpatch.pl).
true
}
validate_series () {
# TODO: Replace with appropriate checks for the whole series
# (e.g. quick build, coding style checks, etc.).
true
}
# main -------------------------------------------------------------------------
if test "$GIT_SENDEMAIL_FILE_COUNTER" = 1
then
remote=$(git config --default origin --get sendemail.validateRemote) &&
ref=$(git config --default HEAD --get sendemail.validateRemoteRef) &&
worktree=$(mktemp --tmpdir -d sendemail-validate.XXXXXXX) &&
git worktree add -fd --checkout "$worktree" "refs/remotes/$remote/$ref" &&
git config --replace-all sendemail.validateWorktree "$worktree"
else
worktree=$(git config --get sendemail.validateWorktree)
fi || {
echo "sendemail-validate: error: failed to prepare worktree" >&2
exit 1
}
unset GIT_DIR GIT_WORK_TREE
cd "$worktree" &&
if grep -q "^diff --git " "$1"
then
validate_patch "$1"
else
validate_cover_letter "$1"
fi &&
if test "$GIT_SENDEMAIL_FILE_COUNTER" = "$GIT_SENDEMAIL_FILE_TOTAL"
then
git config --unset-all sendemail.validateWorktree &&
trap 'git worktree remove -ff "$worktree"' EXIT &&
validate_series
fi
@@ -1,128 +0,0 @@
#!/bin/sh
#
# An example hook script to block unannotated tags from entering.
# Called by "git receive-pack" with arguments: refname sha1-old sha1-new
#
# To enable this hook, rename this file to "update".
#
# Config
# ------
# hooks.allowunannotated
# This boolean sets whether unannotated tags will be allowed into the
# repository. By default they won't be.
# hooks.allowdeletetag
# This boolean sets whether deleting tags will be allowed in the
# repository. By default they won't be.
# hooks.allowmodifytag
# This boolean sets whether a tag may be modified after creation. By default
# it won't be.
# hooks.allowdeletebranch
# This boolean sets whether deleting branches will be allowed in the
# repository. By default they won't be.
# hooks.denycreatebranch
# This boolean sets whether remotely creating branches will be denied
# in the repository. By default this is allowed.
#
# --- Command line
refname="$1"
oldrev="$2"
newrev="$3"
# --- Safety check
if [ -z "$GIT_DIR" ]; then
echo "Don't run this script from the command line." >&2
echo " (if you want, you could supply GIT_DIR then run" >&2
echo " $0 <ref> <oldrev> <newrev>)" >&2
exit 1
fi
if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then
echo "usage: $0 <ref> <oldrev> <newrev>" >&2
exit 1
fi
# --- Config
allowunannotated=$(git config --type=bool hooks.allowunannotated)
allowdeletebranch=$(git config --type=bool hooks.allowdeletebranch)
denycreatebranch=$(git config --type=bool hooks.denycreatebranch)
allowdeletetag=$(git config --type=bool hooks.allowdeletetag)
allowmodifytag=$(git config --type=bool hooks.allowmodifytag)
# check for no description
projectdesc=$(sed -e '1q' "$GIT_DIR/description")
case "$projectdesc" in
"Unnamed repository"* | "")
echo "*** Project description file hasn't been set" >&2
exit 1
;;
esac
# --- Check types
# if $newrev is 0000...0000, it's a commit to delete a ref.
zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
if [ "$newrev" = "$zero" ]; then
newrev_type=delete
else
newrev_type=$(git cat-file -t $newrev)
fi
case "$refname","$newrev_type" in
refs/tags/*,commit)
# un-annotated tag
short_refname=${refname##refs/tags/}
if [ "$allowunannotated" != "true" ]; then
echo "*** The un-annotated tag, $short_refname, is not allowed in this repository" >&2
echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2
exit 1
fi
;;
refs/tags/*,delete)
# delete tag
if [ "$allowdeletetag" != "true" ]; then
echo "*** Deleting a tag is not allowed in this repository" >&2
exit 1
fi
;;
refs/tags/*,tag)
# annotated tag
if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1
then
echo "*** Tag '$refname' already exists." >&2
echo "*** Modifying a tag is not allowed in this repository." >&2
exit 1
fi
;;
refs/heads/*,commit)
# branch
if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then
echo "*** Creating a branch is not allowed in this repository" >&2
exit 1
fi
;;
refs/heads/*,delete)
# delete branch
if [ "$allowdeletebranch" != "true" ]; then
echo "*** Deleting a branch is not allowed in this repository" >&2
exit 1
fi
;;
refs/remotes/*,commit)
# tracking branch
;;
refs/remotes/*,delete)
# delete tracking branch
if [ "$allowdeletebranch" != "true" ]; then
echo "*** Deleting a tracking branch is not allowed in this repository" >&2
exit 1
fi
;;
*)
# Anything else (is there anything else?)
echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2
exit 1
;;
esac
# --- Finished
exit 0
@@ -1,6 +0,0 @@
# git ls-files --others --exclude-from=.git/info/exclude
# Lines that start with '#' are comments.
# For a project mostly in C, the following would be a good set of
# exclude patterns (uncomment them if you want to use them):
# *.[oa]
# *~

Some files were not shown because too many files have changed in this diff Show More