@@ -49,7 +49,7 @@ type Runner struct {
4949func newRunner (agentSource config.Source , runConfig * config.RuntimeConfig , judgeModel provider.Provider , cfg Config ) * Runner {
5050 var judge * Judge
5151 if judgeModel != nil {
52- judge = NewJudge (judgeModel , runConfig , cfg .Concurrency )
52+ judge = NewJudge (judgeModel , cfg .Concurrency )
5353 }
5454 return & Runner {
5555 Config : cfg ,
@@ -117,6 +117,20 @@ func (r *Runner) Run(ctx context.Context, ttyOut, out io.Writer, isTTY bool) ([]
117117 return nil , fmt .Errorf ("loading evaluations: %w" , err )
118118 }
119119
120+ // Check whether any evaluations require relevance checking.
121+ // If so, the judge must be configured and working; validate eagerly
122+ // to fail fast on configuration issues (bad API key, wrong model, etc.)
123+ // instead of silently producing zero-relevance results.
124+ if needsJudge (evals ) {
125+ if r .judge == nil {
126+ return nil , errors .New ("some evaluations have relevance criteria but no judge model is configured (use --judge-model)" )
127+ }
128+ fmt .Fprintln (out , "Validating judge model..." )
129+ if err := r .judge .Validate (ctx ); err != nil {
130+ return nil , fmt .Errorf ("%w" , err )
131+ }
132+ }
133+
120134 // Pre-build all unique Docker images in parallel before running evaluations.
121135 // This avoids serialized builds when multiple workers need the same image.
122136 if err := r .preBuildImages (ctx , out , evals ); err != nil {
@@ -336,17 +350,15 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
336350 result .ToolCallsScore = toolCallF1Score (expectedToolCalls , actualToolCalls )
337351 }
338352
339- result .HandoffsMatch = countHandoffs (expectedToolCalls ) == countHandoffs (actualToolCalls )
340-
341353 if r .judge != nil && len (evals .Relevance ) > 0 {
342354 // Use transcript for relevance checking to preserve temporal ordering
343355 transcript := buildTranscript (events )
344- passed , failed , errs := r .judge .CheckRelevance (ctx , transcript , evals .Relevance )
356+ passed , failed , err := r .judge .CheckRelevance (ctx , transcript , evals .Relevance )
357+ if err != nil {
358+ return result , fmt .Errorf ("relevance check failed: %w" , err )
359+ }
345360 result .RelevancePassed = float64 (passed )
346361 result .FailedRelevance = failed
347- for _ , e := range errs {
348- slog .Warn ("Relevance check error" , "title" , evalSess .Title , "error" , e )
349- }
350362 }
351363
352364 slog .Debug ("Evaluation complete" , "title" , evalSess .Title , "duration" , time .Since (startTime ))
@@ -590,6 +602,14 @@ func matchesAnyPattern(name string, patterns []string) bool {
590602 })
591603}
592604
605+ // needsJudge returns true if any evaluation session has relevance criteria,
606+ // meaning a judge model is required to evaluate them.
607+ func needsJudge (evals []InputSession ) bool {
608+ return slices .ContainsFunc (evals , func (s InputSession ) bool {
609+ return s .Evals != nil && len (s .Evals .Relevance ) > 0
610+ })
611+ }
612+
593613// createJudgeModel creates a provider.Provider from a model string (format: provider/model).
594614// Returns nil if judgeModel is empty.
595615func createJudgeModel (ctx context.Context , judgeModel string , runConfig * config.RuntimeConfig ) (provider.Provider , error ) {
@@ -602,7 +622,10 @@ func createJudgeModel(ctx context.Context, judgeModel string, runConfig *config.
602622 return nil , fmt .Errorf ("invalid judge model format %q: expected 'provider/model'" , judgeModel )
603623 }
604624
605- var opts []options.Opt
625+ opts := []options.Opt {
626+ options .WithThinking (false ),
627+ options .WithStructuredOutput (judgeResponseSchema ),
628+ }
606629 if runConfig .ModelsGateway != "" {
607630 opts = append (opts , options .WithGateway (runConfig .ModelsGateway ))
608631 }
0 commit comments