#!/usr/bin/env bash
# benchmark.sh — reproducible Claude Code benchmark harness.
#
# Runs a fixed task set against one or more models in headless mode, captures
# token usage + wall time + cost, and emits a CSV. Designed to produce the kind
# of numbers reported in guides/benchmarks.md.
#
# Usage:
#   bash tools/benchmark.sh --repo /path/to/repo --models sonnet,haiku --out results.csv
#   bash tools/benchmark.sh --help
#
# Requires: claude (>= 2.1.139), jq, bc. Optional: a git-clean target repo.

set -euo pipefail

# ----- defaults -----
REPO="$PWD"
MODELS="sonnet-4.6"
OUT="benchmark-$(date +%Y%m%d-%H%M%S).csv"
RUNS=3
PLAN_MODE=""
TASKS="T1,T2,T3,T4,T5"

# Prices per million tokens, USD. Update when list prices change.
# Format: input,output (cache-hit input is ~10% of input per Anthropic docs)
# Models without an entry here are costed at 0 (a warning is printed in preflight).
declare -A PRICE_IN PRICE_OUT
PRICE_IN[opus-4.7]=15.00   ; PRICE_OUT[opus-4.7]=75.00
PRICE_IN[sonnet-4.6]=3.00  ; PRICE_OUT[sonnet-4.6]=15.00
PRICE_IN[haiku-4.5]=0.80   ; PRICE_OUT[haiku-4.5]=4.00

# Print the help text to stdout.
usage() {
  cat <<'EOF'
benchmark.sh — reproducible Claude Code benchmark harness

Flags:
  --repo PATH        Target repo to benchmark against (default: $PWD)
  --models LIST      Comma-separated: opus-4.7,sonnet-4.6,haiku-4.5 (default: sonnet-4.6)
  --tasks LIST       Comma-separated task IDs (default: T1,T2,T3,T4,T5). See --list-tasks.
  --runs N           Runs per (model,task) cell; median reported (default: 3)
  --plan-mode        Also run with plan mode on for each task
  --out FILE         Output CSV path (default: benchmark-<timestamp>.csv)
  --list-tasks       Print the task catalog and exit
  --help             This message

Output CSV columns:
  task,model,run,plan_mode,input_tokens,output_tokens,duration_s,cost_usd,outcome

Notes:
  * Runs use `claude -p "<prompt>" --output-format json --model <model>`.
  * Each run starts a fresh session (no carry-over context).
  * Outcome is "pass" if claude exited 0 and produced non-empty output; "fail" otherwise.
    Deeper semantic pass/fail (did the test actually become green?) is task-specific and
    left to the reader — see the post-run hooks in TASK_POSTCHECK.
EOF
}

# ----- task catalog -----
# Each task: id | description | prompt | postcheck-command (optional, evaluated in $REPO)
# `read -d ''` hits EOF and returns non-zero by design, hence the `|| true`.
read -r -d '' TASK_CATALOG <<'EOF' || true
T1|Fix a known failing test|Find the single failing test in this repo, fix it, and confirm the suite passes. Do not touch unrelated files.|
T2|Add --dry-run flag|Add a --dry-run flag to the CLI entrypoint that prints what would happen without executing. Update help text and add a test.|
T3|Rename function repo-wide|Rename UserService.fetchProfile to getProfile across the entire repo. Update all call sites and tests.|
T4|Write unit tests|Write a thorough unit test file for the primary module in src/ (pick the most central one). Aim for ~80% branch coverage.|
T5|Explain and flag risks|Explain the authentication flow in this repo. Flag any security risks or code smells you find. No edits — answer only.|
T6|Framework migration|Migrate the main HTTP framework one major version forward (e.g. Express 4→5, Flask 2→3). Update all deprecated APIs.|
EOF

# Print the id and description of every catalog entry as a two-column table.
list_tasks() {
  local id desc
  printf "%-4s %s\n" "ID" "Description"
  printf "%-4s %s\n" "--" "-----------"
  while IFS='|' read -r id desc _ _; do
    [[ -z "$id" ]] && continue
    printf "%-4s %s\n" "$id" "$desc"
  done <<< "$TASK_CATALOG"
}

# ----- parse args -----
# Abort with a readable message when a value-taking flag is the last argument
# (otherwise `set -u` reports a cryptic "$2: unbound variable").
#   $1 - flag name, $2 - number of arguments remaining, including the flag
require_value() {
  if (( $2 < 2 )); then
    echo "benchmark: $1 requires a value" >&2
    exit 1
  fi
}

while (( $# > 0 )); do
  case "$1" in
    --repo)       require_value "$1" "$#"; REPO="$2";   shift 2 ;;
    --models)     require_value "$1" "$#"; MODELS="$2"; shift 2 ;;
    --tasks)      require_value "$1" "$#"; TASKS="$2";  shift 2 ;;
    --runs)       require_value "$1" "$#"; RUNS="$2";   shift 2 ;;
    --plan-mode)  PLAN_MODE="1"; shift ;;
    --out)        require_value "$1" "$#"; OUT="$2";    shift 2 ;;
    --list-tasks) list_tasks; exit 0 ;;
    --help|-h)    usage; exit 0 ;;
    *)            echo "unknown flag: $1" >&2; usage >&2; exit 1 ;;
  esac
done

# ----- preflight -----
for tool in claude jq bc; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "benchmark: missing required tool: $tool" >&2
    exit 1
  fi
done

if [[ ! -d "$REPO" ]]; then
  echo "benchmark: --repo is not a directory: $REPO" >&2
  exit 1
fi

if [[ ! "$RUNS" =~ ^[1-9][0-9]*$ ]]; then
  echo "benchmark: --runs must be a positive integer: $RUNS" >&2
  exit 1
fi

# ----- helpers -----
# Median of stdin (one number per line). Simple sort + mid; prints 0 on empty input.
median() {
  sort -n | awk '
    { a[NR] = $1 }
    END {
      if (NR == 0) { print 0; exit }
      if (NR % 2 == 1) print a[(NR + 1) / 2]
      else print (a[NR / 2] + a[NR / 2 + 1]) / 2
    }'
}

# Print the USD cost of one run with four decimals.
#   $1 - model key into PRICE_IN/PRICE_OUT (unknown models cost 0)
#   $2 - input token count, $3 - output token count
cost_usd() {
  local model="$1" in_tokens="$2" out_tokens="$3"
  local pin="${PRICE_IN[$model]:-0}" pout="${PRICE_OUT[$model]:-0}"
  local raw
  raw="$(echo "scale=4; ($in_tokens * $pin + $out_tokens * $pout) / 1000000" | bc)"
  # bc prints ".0010" / "0"; normalise to the same "0.0010" shape the failure
  # rows use. LC_ALL=C keeps "." as the decimal separator in every locale.
  LC_ALL=C printf '%.4f\n' "$raw"
}

# Print the prompt for task id $1, or an empty line if the id is not in the catalog.
get_task_prompt() {
  local want="$1" id prompt
  while IFS='|' read -r id _ prompt _; do
    if [[ "$id" == "$want" ]]; then
      echo "$prompt"
      return
    fi
  done <<< "$TASK_CATALOG"
  echo ""
}

# ----- run one cell -----
# Run a single (task, model, run, plan) cell and print one CSV row to stdout.
#   $1 - task id, $2 - model, $3 - run number, $4 - "1" to run in plan mode
# Returns 1 only for an unknown task id; a failed or unparseable claude run is
# reported as a row with outcome "fail" and return status 0.
run_one() {
  local task="$1" model="$2" run="$3" plan="$4"
  local prompt
  prompt="$(get_task_prompt "$task")"
  if [[ -z "$prompt" ]]; then
    echo "unknown task: $task" >&2
    return 1
  fi

  # An array keeps the optional flag as separate words without relying on
  # unquoted word splitting.
  local -a plan_args=()
  if [[ "$plan" == "1" ]]; then
    plan_args=(--permission-mode plan)
  fi

  local start end duration
  start=$(date +%s)

  # Run headless. --output-format json surfaces token usage in the result.
  # The ${arr[@]+…} form is safe under `set -u` when the array is empty.
  local out
  if ! out="$(cd "$REPO" && claude -p "$prompt" --output-format json --model "$model" \
      ${plan_args[@]+"${plan_args[@]}"} 2>/dev/null)"; then
    end=$(date +%s); duration=$((end - start))
    echo "$task,$model,$run,$plan,0,0,$duration,0.0000,fail"
    return 0
  fi
  end=$(date +%s); duration=$((end - start))

  # Extract all three fields in one jq pass. Output that is not the expected
  # JSON object makes jq fail, which is recorded as a failed run instead of
  # producing empty CSV fields.
  local parsed
  if ! parsed="$(jq -r '[
        (.usage.input_tokens // 0),
        (.usage.output_tokens // 0),
        (if .result and (.result|length) > 0 then "pass" else "fail" end)
      ] | @tsv' <<< "$out" 2>/dev/null)"; then
    echo "$task,$model,$run,$plan,0,0,$duration,0.0000,fail"
    return 0
  fi

  local in_tokens out_tokens outcome cost
  IFS=$'\t' read -r in_tokens out_tokens outcome <<< "$parsed"
  # Guard the bc expression against non-numeric token fields.
  [[ "$in_tokens" =~ ^[0-9]+$ ]] || in_tokens=0
  [[ "$out_tokens" =~ ^[0-9]+$ ]] || out_tokens=0
  cost="$(cost_usd "$model" "$in_tokens" "$out_tokens")"

  echo "$task,$model,$run,$plan,$in_tokens,$out_tokens,$duration,$cost,$outcome"
}

# ----- main loop -----
IFS=',' read -ra task_list <<< "$TASKS"
IFS=',' read -ra model_list <<< "$MODELS"

if (( ${#task_list[@]} == 0 )); then
  echo "benchmark: --tasks is empty" >&2
  exit 1
fi
if (( ${#model_list[@]} == 0 )); then
  echo "benchmark: --models is empty" >&2
  exit 1
fi

# Reject unknown task ids before any run starts, so a typo cannot abort the
# benchmark after earlier cells have already consumed tokens.
for task in "${task_list[@]}"; do
  if [[ -z "$(get_task_prompt "$task")" ]]; then
    echo "benchmark: unknown task: $task (see --list-tasks)" >&2
    exit 1
  fi
done

for model in "${model_list[@]}"; do
  if [[ -z "$model" ]]; then
    echo "benchmark: empty entry in --models: $MODELS" >&2
    exit 1
  fi
  if [[ -z "${PRICE_IN[$model]:-}" || -z "${PRICE_OUT[$model]:-}" ]]; then
    echo "benchmark: warning: no price entry for model '$model'; cost_usd will be 0" >&2
  fi
done

echo "task,model,run,plan_mode,input_tokens,output_tokens,duration_s,cost_usd,outcome" > "$OUT"

plan_modes=("0")
if [[ "$PLAN_MODE" == "1" ]]; then
  plan_modes=("0" "1")
fi

total_cells=$(( ${#task_list[@]} * ${#model_list[@]} * ${#plan_modes[@]} * RUNS ))
cell=0

for task in "${task_list[@]}"; do
  for model in "${model_list[@]}"; do
    for plan in "${plan_modes[@]}"; do
      for (( run = 1; run <= RUNS; run++ )); do
        cell=$((cell + 1))
        echo "[$cell/$total_cells] task=$task model=$model plan=$plan run=$run" >&2
        line="$(run_one "$task" "$model" "$run" "$plan")"
        printf '%s\n' "$line" | tee -a "$OUT"
      done
    done
  done
done

echo
echo "=== Summary (median per cell) ==="
echo

# Group by (task, model, plan) and print medians.
# Reads the raw CSV in $OUT (header skipped) and prints one aligned row per
# (task, model, plan) cell: median input/output tokens, median duration,
# median cost and the pass rate.
{
  echo "task|model|plan|in_med|out_med|dur_med|cost_med|pass_rate"
  # Stage 1 (awk): collect each cell's values as space-separated lists and
  #   compute the pass rate. The key k already expands to three '|' fields, so
  #   the format takes exactly five %s plus one %.0f.
  # Stage 2 (sort): awk's `for (k in n)` order is unspecified; sort for a
  #   reproducible table.
  # Stage 3 (while): reduce each list to its median via the median helper.
  tail -n +2 "$OUT" \
    | awk -F, '
        {
          k = $1 "|" $2 "|" $4
          n[k]++
          in_v[k]   = in_v[k]   " " $5
          out_v[k]  = out_v[k]  " " $6
          dur_v[k]  = dur_v[k]  " " $7
          cost_v[k] = cost_v[k] " " $8
          if ($9 == "pass") pass[k]++
        }
        END {
          for (k in n) {
            printf "%s|%s|%s|%s|%s|%.0f%%\n", k, in_v[k], out_v[k], dur_v[k], cost_v[k], (pass[k] / n[k]) * 100
          }
        }' \
    | LC_ALL=C sort -t '|' -k1,1 -k2,2 -k3,3 \
    | while IFS='|' read -r task model plan in_v out_v dur_v cost_v pass; do
        # One value per line, blank lines dropped. sed is used instead of
        # `grep -v`, which returns 1 on empty output and trips pipefail.
        in_med=$(tr -s ' ' '\n' <<< "$in_v" | sed '/^$/d' | median)
        out_med=$(tr -s ' ' '\n' <<< "$out_v" | sed '/^$/d' | median)
        dur_med=$(tr -s ' ' '\n' <<< "$dur_v" | sed '/^$/d' | median)
        cost_med=$(tr -s ' ' '\n' <<< "$cost_v" | sed '/^$/d' | median)
        printf '%s|%s|%s|%s|%s|%ss|$%s|%s\n' \
          "$task" "$model" "$plan" "$in_med" "$out_med" "$dur_med" "$cost_med" "$pass"
      done
} | column -t -s '|'

echo
echo "Wrote raw results to: $OUT"