+ @if (result.timings) {
+
+
Timings
+
+ -
+ Generate: {{(result.timings.generateDurationMs / 1000) | number:'1.1-2'}}s
+
+ -
+ Build: {{(result.timings.buildDurationMs / 1000) | number:'1.1-2'}}s
+
+ -
+ Repair: {{(result.timings.repairDurationMs / 1000) | number:'1.1-2'}}s
+
+
+
+ }
+
@if (result.testResult) {
Test Results
diff --git a/runner/orchestration/build-serve-test-loop.ts b/runner/orchestration/build-serve-test-loop.ts
index cc6b62e4..884c90c5 100644
--- a/runner/orchestration/build-serve-test-loop.ts
+++ b/runner/orchestration/build-serve-test-loop.ts
@@ -4,10 +4,12 @@ import {BuildResultStatus} from '../workers/builder/builder-types.js';
import {Environment} from '../configuration/environment.js';
import {
AssessmentConfig,
+ AssessmentTimings,
AttemptDetails,
LlmContextFile,
RootPromptDefinition,
} from '../shared-interfaces.js';
+import {performance} from 'node:perf_hooks';
import {ProgressLogger} from '../progress/progress-logger.js';
import {BuildType, runBuild} from './build-worker.js';
import {EvalID} from './executors/executor.js';
@@ -53,7 +55,9 @@ export async function attemptBuildAndTest(
workerConcurrencyQueue: PQueue,
progress: ProgressLogger,
userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined,
+ timings: AssessmentTimings,
) {
+ const initialBuildStart = performance.now();
const initialBuildResult = await runBuild(
evalID,
directory,
@@ -64,6 +68,7 @@ export async function attemptBuildAndTest(
progress,
BuildType.INITIAL_BUILD,
);
+ timings.buildDurationMs += performance.now() - initialBuildStart;
let repairAttempts = 0;
let maxRepairAttempts: number;
let maxTestRepairAttempts: number;
@@ -100,6 +105,7 @@ export async function attemptBuildAndTest(
`Trying to repair app build (attempt #${repairAttempts + 1})`,
);
+ const repairStart = performance.now();
const attempt = await repairAndBuild(
evalID,
config.model,
@@ -120,6 +126,7 @@ export async function attemptBuildAndTest(
progress,
'build',
);
+ timings.repairDurationMs += performance.now() - repairStart;
attemptDetails.push(attempt);
lastAttempt = attempt;
@@ -200,6 +207,7 @@ export async function attemptBuildAndTest(
});
}
+ const repairStart = performance.now();
const attempt = await repairAndBuild(
evalID,
config.model,
@@ -224,6 +232,7 @@ export async function attemptBuildAndTest(
// further repairs and capture the failed build. This is useful insight
// as LLMs seem to regress when asked to repair violations.
if (hasBuildFailure) {
+ timings.repairDurationMs += performance.now() - repairStart;
break;
}
@@ -249,6 +258,7 @@ export async function attemptBuildAndTest(
workerConcurrencyQueue,
progress,
)) ?? undefined;
+ timings.repairDurationMs += performance.now() - repairStart;
if (hasAxeFailure && lastAttempt.serveTestingResult?.axeViolations?.length === 0) {
progress.log(rootPromptDef, 'success', `Successfully fixed all Axe accessibility violations`);
diff --git a/runner/orchestration/generate-eval-task.ts b/runner/orchestration/generate-eval-task.ts
index 9229a50c..83484db5 100644
--- a/runner/orchestration/generate-eval-task.ts
+++ b/runner/orchestration/generate-eval-task.ts
@@ -18,6 +18,7 @@ import {rateGeneratedCode} from '../ratings/rate-code.js';
import {DEFAULT_AUTORATER_MODEL_NAME} from '../configuration/constants.js';
import assert from 'node:assert';
import {AiSdkRunner} from '../codegen/ai-sdk/ai-sdk-runner.js';
+import {performance} from 'node:perf_hooks';
/**
* Creates and executes a task to generate or load code for a given prompt,
@@ -60,6 +61,7 @@ export async function startEvaluationTask(
// and for each sub-prompt, because the project will be augmented on each iteration.
const contextFiles = await resolveContextFiles(promptDef.contextFilePatterns, directory);
+ const generateStart = performance.now();
// Generate the initial set of files through the LLM.
const initialResponse = await generateInitialFiles(
config,
@@ -76,6 +78,7 @@ export async function startEvaluationTask(
abortSignal,
progress,
);
+ const generateDurationMs = performance.now() - generateStart;
const toolLogs = initialResponse.toolLogs ?? [];
@@ -140,6 +143,7 @@ export async function startEvaluationTask(
}
const attemptDetails: AttemptDetails[] = []; // Store details for assessment.json
+ const timings = {generateDurationMs, buildDurationMs: 0, repairDurationMs: 0};
// Try to build the files in the root prompt directory.
// This will also attempt to fix issues with the generated code.
@@ -156,6 +160,7 @@ export async function startEvaluationTask(
workerConcurrencyQueue,
progress,
userJourneyAgentTaskInput,
+ timings,
);
if (!attempt) {
@@ -197,6 +202,7 @@ export async function startEvaluationTask(
toolLogs,
testResult: attempt.testResult ?? null,
testRepairAttempts: attempt.testRepairAttempts,
+ timings,
} satisfies AssessmentResult);
}
diff --git a/runner/ratings/stats.ts b/runner/ratings/stats.ts
index a97e927e..3f39261e 100644
--- a/runner/ratings/stats.ts
+++ b/runner/ratings/stats.ts
@@ -2,6 +2,7 @@ import {BuildErrorType, BuildResultStatus} from '../workers/builder/builder-type
import {UserFacingError} from '../utils/errors.js';
import {
AggregatedRunStats,
+ AggregatedTimings,
AssessmentResult,
RuntimeStats,
ScoreBucket,
@@ -15,6 +16,21 @@ export const BUCKET_CONFIG = [
{name: 'Poor', min: 0, max: 70, id: 'poor'},
];
+function calculateMean(values: number[]): number { // Arithmetic mean of `values`.
+ if (values.length === 0) return 0; // Empty input: return 0 instead of dividing by zero (which would give NaN).
+ return values.reduce((sum, value) => sum + value, 0) / values.length; // Sum of all entries divided by their count.
+}
+
+function calculateMedian(values: number[]): number { // Median of `values`; the input array itself is not reordered.
+ if (values.length === 0) return 0; // Empty input yields 0.
+ const sorted = [...values].sort((a, b) => a - b); // Sort a copy in ascending numeric order (the default sort would compare as strings).
+ const middle = Math.floor(sorted.length / 2); // Index of the central element (the upper of the two when the count is even).
+ if (sorted.length % 2 === 0) {
+ return (sorted[middle - 1] + sorted[middle]) / 2; // Even count: average the two central values.
+ }
+ return sorted[middle]; // Odd count: the single central value.
+}
+
/**
* Calculates build and check statistics from assessment results.
*
@@ -22,6 +38,10 @@ export const BUCKET_CONFIG = [
* @returns An object containing aggregated build and check statistics.
*/
export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): AggregatedRunStats {
+ const generateDurations: number[] = [];
+ const buildDurations: number[] = [];
+ const repairDurations: number[] = [];
+
let successfulInitialBuilds = 0;
let successfulBuildsAfterRepair = 0;
let failedBuilds = 0;
@@ -63,6 +83,12 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
}
}
+ if (result.timings) {
+ generateDurations.push(result.timings.generateDurationMs);
+ buildDurations.push(result.timings.buildDurationMs);
+ repairDurations.push(result.timings.repairDurationMs);
+ }
+
// Calculate test statistics
if (result.testResult) {
if (result.testResult.passed) {
@@ -158,6 +184,22 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
: undefined,
accessibility: accessibilityStats,
security: securityStats,
+ ...(generateDurations.length > 0 && {
+ timings: {
+ generate: {
+ mean: calculateMean(generateDurations),
+ median: calculateMedian(generateDurations),
+ },
+ build: {
+ mean: calculateMean(buildDurations),
+ median: calculateMedian(buildDurations),
+ },
+ repair: {
+ mean: calculateMean(repairDurations),
+ median: calculateMedian(repairDurations),
+ },
+ },
+ }),
};
}
diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts
index 9a61c19f..b7139ed6 100644
--- a/runner/shared-interfaces.ts
+++ b/runner/shared-interfaces.ts
@@ -235,6 +235,20 @@ export interface SkippedIndividualAssessment {
groupingLabels?: string[];
}
+/** Elapsed time, in milliseconds, spent in each phase of a single evaluation. */
+export interface AssessmentTimings {
+ generateDurationMs: number; // Time taken by the initial LLM file generation.
+ buildDurationMs: number; // Time taken by the initial build; added to with `+=` by the build/test loop.
+ repairDurationMs: number; // Total time across all repair attempts; accumulated with `+=` per attempt.
+}
+
+/** Mean and median of each timing phase, aggregated over the assessments that recorded timings. */
+export interface AggregatedTimings {
+ generate: {mean: number; median: number}; // Statistics over `generateDurationMs` values (milliseconds).
+ build: {mean: number; median: number}; // Statistics over `buildDurationMs` values (milliseconds).
+ repair: {mean: number; median: number}; // Statistics over `repairDurationMs` values (milliseconds).
+}
+
/**
* Represents the overall score and breakdown of code assessments.
*/
@@ -345,6 +359,9 @@ export interface AggregatedRunStats {
appsWithoutErrors: number;
};
security?: {appsWithErrors: number; appsWithoutErrors: number};
+
+ /** Timing statistics for the run. */
+ timings?: AggregatedTimings;
}
export interface CompletionStats {
@@ -555,6 +572,8 @@ export interface AssessmentResult {
testResult: TestExecutionResult | null;
/** Number of repair attempts for tests. */
testRepairAttempts?: number;
+ /** Timings captured for the execution and repair stages. */
+ timings?: AssessmentTimings;
}
/**