Skip to content

Commit bd55840

Browse files
authored
Merge pull request #2100 from dgageot/improve-evals-5
Improve evals
2 parents 0d76412 + e8e5ba6 commit bd55840

File tree

9 files changed

+241
-197
lines changed

9 files changed

+241
-197
lines changed

docs/features/evaluation/index.md

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -130,7 +130,6 @@ docker-agent evaluates agents across four dimensions:
130130
| **Tool Calls (F1)** | F1 score between the expected tool call sequence (from the recorded session) and the actual tool calls made by the agent. |
131131
| **Relevance** | An LLM judge (configurable via `--judge-model`) evaluates whether each relevance statement is satisfied by the response. |
132132
| **Size** | Whether the response length matches the expected size category (S/M/L/XL). |
133-
| **Handoffs** | For multi-agent configs, whether task delegation matched the expected agent handoff pattern. |
134133

135134
## Creating Eval Sessions
136135

@@ -192,7 +191,6 @@ $ docker agent eval demo.yaml ./evals
192191
Summary: 2/2 passed
193192
Sizes: 0/0
194193
Tool Calls: avg F1 1.00 (2 evals)
195-
Handoffs: 2/2
196194
Relevance: 3/3
197195

198196
Sessions DB: ./evals/results/happy-panda-1234.db

pkg/evaluation/eval.go

Lines changed: 31 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ type Runner struct {
4949
func newRunner(agentSource config.Source, runConfig *config.RuntimeConfig, judgeModel provider.Provider, cfg Config) *Runner {
5050
var judge *Judge
5151
if judgeModel != nil {
52-
judge = NewJudge(judgeModel, runConfig, cfg.Concurrency)
52+
judge = NewJudge(judgeModel, cfg.Concurrency)
5353
}
5454
return &Runner{
5555
Config: cfg,
@@ -117,6 +117,20 @@ func (r *Runner) Run(ctx context.Context, ttyOut, out io.Writer, isTTY bool) ([]
117117
return nil, fmt.Errorf("loading evaluations: %w", err)
118118
}
119119

120+
// Check whether any evaluations require relevance checking.
121+
// If so, the judge must be configured and working; validate eagerly
122+
// to fail fast on configuration issues (bad API key, wrong model, etc.)
123+
// instead of silently producing zero-relevance results.
124+
if needsJudge(evals) {
125+
if r.judge == nil {
126+
return nil, errors.New("some evaluations have relevance criteria but no judge model is configured (use --judge-model)")
127+
}
128+
fmt.Fprintln(out, "Validating judge model...")
129+
if err := r.judge.Validate(ctx); err != nil {
130+
return nil, fmt.Errorf("%w", err)
131+
}
132+
}
133+
120134
// Pre-build all unique Docker images in parallel before running evaluations.
121135
// This avoids serialized builds when multiple workers need the same image.
122136
if err := r.preBuildImages(ctx, out, evals); err != nil {
@@ -336,17 +350,15 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
336350
result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
337351
}
338352

339-
result.HandoffsMatch = countHandoffs(expectedToolCalls) == countHandoffs(actualToolCalls)
340-
341353
if r.judge != nil && len(evals.Relevance) > 0 {
342354
// Use transcript for relevance checking to preserve temporal ordering
343355
transcript := buildTranscript(events)
344-
passed, failed, errs := r.judge.CheckRelevance(ctx, transcript, evals.Relevance)
356+
passed, failed, err := r.judge.CheckRelevance(ctx, transcript, evals.Relevance)
357+
if err != nil {
358+
return result, fmt.Errorf("relevance check failed: %w", err)
359+
}
345360
result.RelevancePassed = float64(passed)
346361
result.FailedRelevance = failed
347-
for _, e := range errs {
348-
slog.Warn("Relevance check error", "title", evalSess.Title, "error", e)
349-
}
350362
}
351363

352364
slog.Debug("Evaluation complete", "title", evalSess.Title, "duration", time.Since(startTime))
@@ -590,6 +602,14 @@ func matchesAnyPattern(name string, patterns []string) bool {
590602
})
591603
}
592604

605+
// needsJudge returns true if any evaluation session has relevance criteria,
606+
// meaning a judge model is required to evaluate them.
607+
func needsJudge(evals []InputSession) bool {
608+
return slices.ContainsFunc(evals, func(s InputSession) bool {
609+
return s.Evals != nil && len(s.Evals.Relevance) > 0
610+
})
611+
}
612+
593613
// createJudgeModel creates a provider.Provider from a model string (format: provider/model).
594614
// Returns nil if judgeModel is empty.
595615
func createJudgeModel(ctx context.Context, judgeModel string, runConfig *config.RuntimeConfig) (provider.Provider, error) {
@@ -602,7 +622,10 @@ func createJudgeModel(ctx context.Context, judgeModel string, runConfig *config.
602622
return nil, fmt.Errorf("invalid judge model format %q: expected 'provider/model'", judgeModel)
603623
}
604624

605-
var opts []options.Opt
625+
opts := []options.Opt{
626+
options.WithThinking(false),
627+
options.WithStructuredOutput(judgeResponseSchema),
628+
}
606629
if runConfig.ModelsGateway != "" {
607630
opts = append(opts, options.WithGateway(runConfig.ModelsGateway))
608631
}

0 commit comments

Comments
 (0)