@@ -7,7 +7,9 @@ architecture and extended with patterns from
 
 The infrastructure is **self-contained** — there are no external eval-framework
 dependencies. A lightweight shell runner (`run-eval.sh`) executes each task's
-reference solution and deterministic grader.
+reference solution and deterministic grader, and can also dispatch tasks to
+AI coding agents (GitHub Copilot CLI or Google Gemini CLI) for end-to-end
+evaluation.
 
 ## Overview
 
@@ -32,6 +34,14 @@ Each task includes:
 
 - Bash 4+
 - `bc` (installed by default on most Linux / macOS systems)
+- Node.js 20+ (for config parsing and agent CLI installation)
+
+**For agent-based evaluation (optional):**
+
+| Agent | Install | Auth |
+|---|---|---|
+| GitHub Copilot | `npm install -g @github/copilot` | Active Copilot subscription; `GITHUB_TOKEN` env var |
+| Google Gemini | `npm install -g @google/gemini-cli` | `GEMINI_API_KEY` env var |
 
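Before a first run, the prerequisites above can be sanity-checked from a shell (a quick illustrative check, not part of `run-eval.sh`):

```bash
# Confirm the prerequisites listed above; all flags used here are standard.
bash --version | head -n 1          # needs Bash 4+
command -v node && node --version   # needs Node.js 20+ for agent runs
echo "2 + 2" | bc                   # prints 4 when bc is available
```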
 ## Running Evals Locally
 
@@ -50,16 +60,73 @@ bash run-eval.sh --all --validate
 bash run-eval.sh grid-basic-setup --validate
 ```
 
+### Run evals against an AI agent
+
+This mode sends the task's `instruction.md` to a coding agent CLI, lets the
+agent generate code in an isolated workspace, then runs the deterministic
+grader on the output.
+
+```bash
+cd evals
+
+# Run all tasks with GitHub Copilot CLI
+bash run-eval.sh --all --agent copilot
+
+# Run a single task with Gemini CLI
+bash run-eval.sh grid-basic-setup --agent gemini
+
+# Run 3 trials per task for statistical robustness
+bash run-eval.sh --all --agent copilot --trials 3
+```
+
 ### npm scripts (convenience wrappers)
 
 ```bash
 cd evals
+
+# Validation (reference solutions)
 npm run validate          # all tasks
 npm run validate:grid     # grid-basic-setup only
 npm run validate:combo    # component-combo-reactive-form only
 npm run validate:theming  # theming-palette-generation only
+
+# Agent-based evaluation
+npm run agent:copilot          # all tasks with Copilot
+npm run agent:copilot:grid     # grid task with Copilot
+npm run agent:gemini           # all tasks with Gemini
+npm run agent:gemini:theming   # theming task with Gemini
+```
+
+## Agent Configuration
+
+Agent settings are stored in `eval-config.json`:
+
+```json
+{
+  "defaultAgent": "copilot",
+  "agents": {
+    "copilot": {
+      "command": "copilot",
+      "installCommand": "npm install -g @github/copilot",
+      "promptArgs": ["-p"],
+      "autoApproveArgs": ["--yes"],
+      "envAuth": "GITHUB_TOKEN"
+    },
+    "gemini": {
+      "command": "gemini",
+      "installCommand": "npm install -g @google/gemini-cli",
+      "promptArgs": ["-p"],
+      "autoApproveArgs": ["--sandbox"],
+      "envAuth": "GEMINI_API_KEY"
+    }
+  },
+  "trialCount": 1,
+  "timeoutSec": 600
+}
 ```
 
+You can customize the agent command, flags, and timeouts by editing this file.
+To switch the default agent, change `defaultAgent`.
+
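Because Node.js is already a prerequisite, a hand-edited config can be sanity-checked inline before a run. A minimal sketch; the field names mirror `eval-config.json`, but the check itself is hypothetical and the values are inlined so the snippet is self-contained (the runner does not perform this validation):

```bash
# Hypothetical pre-flight check for an edited config. The object is inlined
# here for self-containment; in practice you would read eval-config.json.
node -e '
const cfg = { defaultAgent: "gemini", trialCount: 3, timeoutSec: 600 };
if (!["copilot", "gemini"].includes(cfg.defaultAgent)) {
  throw new Error("unknown agent: " + cfg.defaultAgent);
}
console.log("ok:", cfg.defaultAgent + ",", cfg.trialCount, "trial(s)");
'
```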
 ## Adding a New Task
 
 1. Create a directory under `evals/tasks/<task-id>/` with the standard structure:
@@ -95,25 +162,43 @@ npm run validate:theming # theming-palette-generation only
    bash run-eval.sh <task-id> --validate
    ```
 
+7. Test against at least one agent:
+
+   ```bash
+   bash run-eval.sh <task-id> --agent copilot
+   ```
+
 ## Pass / Fail Thresholds
 
 Following [Anthropic's recommendations](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents):
 
 | Metric | Threshold | Effect |
 |---|---|---|
-| `pass@5 ≥ 80%` | **Merge gate** | At least 1 success in 5 trials required |
-| `pass^5 ≥ 60%` | **Tracked** | Flags flaky skills for investigation |
-| `pass@5 < 60%` | **Blocks merge** | On PRs touching the relevant skill |
+| `pass@k ≥ 80%` | **Merge gate** | At least 1 success in k trials required |
+| `pass@k ≥ 60%` | **Tracked** | Flags flaky skills for investigation |
+| `pass@k < 60%` | **Blocks merge** | On PRs touching the relevant skill |
 
 ## CI Integration
 
-The GitHub Actions workflow at `.github/workflows/skill-eval.yml` runs
-automatically on PRs that modify `skills/**` or `evals/**`. It:
+The GitHub Actions workflow at `.github/workflows/skill-eval.yml` provides two
+evaluation modes:
 
-1. Checks out the repo
-2. Validates all graders against their reference solutions
-3. Uploads results as an artifact
-4. Posts a summary comment on the PR
+### Automatic (on PR)
+Runs on every PR that modifies `skills/**` or `evals/**`:
+1. Validates all graders against their reference solutions
+2. Uploads results as an artifact
+3. Posts a summary comment on the PR
+
+### Manual (workflow_dispatch)
+Triggered manually from the Actions tab to run agent-based evaluation:
+1. Accepts the agent (`copilot` or `gemini`) and the number of trials as inputs
+2. Installs the selected agent CLI
+3. Runs all tasks against the agent
+4. Uploads results as an artifact
+
+**Secrets required for agent-based CI:**
+- `GITHUB_TOKEN` — automatically available (for Copilot)
+- `GEMINI_API_KEY` — must be added as a repository secret (for Gemini)
 
 ## Grading Strategy
 
@@ -135,3 +220,7 @@ automatically on PRs that modify `skills/**` or `evals/**`. It:
 Baseline results are stored in `evals/results/baseline.json` and used for
 regression comparison on PRs. The CI workflow uploads per-run results as
 GitHub Actions artifacts.
+
+Agent-based results are suffixed with the agent name (e.g.,
+`grid-basic-setup-copilot.json`) to distinguish them from reference
+validation results.
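Concretely, the suffix rule gives filenames like these (the `evals/results/` paths are assembled here for illustration; the doc only fixes the suffix):

```bash
# Result filenames for one task under the agent-suffix convention above.
task="grid-basic-setup"
echo "evals/results/${task}-copilot.json"   # Copilot agent run
echo "evals/results/${task}-gemini.json"    # Gemini agent run
```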