Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions report-app/src/app/pages/report-viewer/report-viewer.html
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,25 @@ <h3 class="chart-title">
}
</div>

<!--
Run-level timing summary. Rendered only when `overview.stats.timings` is set.
Each badge shows the mean and median for one phase (generate, build, repair);
values are divided by 1000 and suffixed with "s", then formatted with the
`number` pipe using the '1.1-2' digits pattern.
-->
@if (overview.stats.timings) {
@let timings = overview.stats.timings;
<h3>Timings</h3>
<ul class="status-badge-group">
<li class="status-badge neutral">
<b>Generate:</b> {{(timings.generate.mean / 1000) | number:'1.1-2'}}s (mean), {{(timings.generate.median / 1000) |
number:'1.1-2'}}s (median)
</li>
<li class="status-badge neutral">
<b>Build:</b> {{(timings.build.mean / 1000) | number:'1.1-2'}}s (mean), {{(timings.build.median / 1000) |
number:'1.1-2'}}s (median)
</li>
<li class="status-badge neutral">
<b>Repair:</b> {{(timings.repair.mean / 1000) | number:'1.1-2'}}s (mean), {{(timings.repair.median / 1000) |
number:'1.1-2'}}s (median)
</li>
</ul>
}

@if (details) {
<h3>Usage Details</h3>
<ul class="status-badge-group">
Expand Down Expand Up @@ -466,6 +485,23 @@ <h5>
</div>
</div>

<!--
Per-result timing details. Rendered only when `result.timings` is set.
Each `*DurationMs` value is divided by 1000 and suffixed with "s", then
formatted with the `number` pipe using the '1.1-2' digits pattern.
-->
@if (result.timings) {
<div class="app-details-section">
<h4>Timings</h4>
<ul class="status-badge-group">
<li class="status-badge neutral">
<b>Generate:</b> {{(result.timings.generateDurationMs / 1000) | number:'1.1-2'}}s
</li>
<li class="status-badge neutral">
<b>Build:</b> {{(result.timings.buildDurationMs / 1000) | number:'1.1-2'}}s
</li>
<li class="status-badge neutral">
<b>Repair:</b> {{(result.timings.repairDurationMs / 1000) | number:'1.1-2'}}s
</li>
</ul>
</div>
}

@if (result.testResult) {
<div class="app-details-section">
<h4>Test Results</h4>
Expand Down
10 changes: 10 additions & 0 deletions runner/orchestration/build-serve-test-loop.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@ import {BuildResultStatus} from '../workers/builder/builder-types.js';
import {Environment} from '../configuration/environment.js';
import {
AssessmentConfig,
AssessmentTimings,
AttemptDetails,
LlmContextFile,
RootPromptDefinition,
} from '../shared-interfaces.js';
import {performance} from 'node:perf_hooks';
import {ProgressLogger} from '../progress/progress-logger.js';
import {BuildType, runBuild} from './build-worker.js';
import {EvalID} from './executors/executor.js';
Expand Down Expand Up @@ -53,7 +55,9 @@ export async function attemptBuildAndTest(
workerConcurrencyQueue: PQueue,
progress: ProgressLogger,
userJourneyAgentTaskInput: BrowserAgentTaskInput | undefined,
timings: AssessmentTimings,
) {
const initialBuildStart = performance.now();
const initialBuildResult = await runBuild(
evalID,
directory,
Expand All @@ -64,6 +68,7 @@ export async function attemptBuildAndTest(
progress,
BuildType.INITIAL_BUILD,
);
timings.buildDurationMs += performance.now() - initialBuildStart;
let repairAttempts = 0;
let maxRepairAttempts: number;
let maxTestRepairAttempts: number;
Expand Down Expand Up @@ -100,6 +105,7 @@ export async function attemptBuildAndTest(
`Trying to repair app build (attempt #${repairAttempts + 1})`,
);

const repairStart = performance.now();
const attempt = await repairAndBuild(
evalID,
config.model,
Expand All @@ -120,6 +126,7 @@ export async function attemptBuildAndTest(
progress,
'build',
);
timings.repairDurationMs += performance.now() - repairStart;

attemptDetails.push(attempt);
lastAttempt = attempt;
Expand Down Expand Up @@ -200,6 +207,7 @@ export async function attemptBuildAndTest(
});
}

const repairStart = performance.now();
const attempt = await repairAndBuild(
evalID,
config.model,
Expand All @@ -224,6 +232,7 @@ export async function attemptBuildAndTest(
// further repairs and capture the failed build. This is useful insight
// as LLMs seem to regress when asked to repair violations.
if (hasBuildFailure) {
timings.repairDurationMs += performance.now() - repairStart;
break;
}

Expand All @@ -249,6 +258,7 @@ export async function attemptBuildAndTest(
workerConcurrencyQueue,
progress,
)) ?? undefined;
timings.repairDurationMs += performance.now() - repairStart;

if (hasAxeFailure && lastAttempt.serveTestingResult?.axeViolations?.length === 0) {
progress.log(rootPromptDef, 'success', `Successfully fixed all Axe accessibility violations`);
Expand Down
6 changes: 6 additions & 0 deletions runner/orchestration/generate-eval-task.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import {rateGeneratedCode} from '../ratings/rate-code.js';
import {DEFAULT_AUTORATER_MODEL_NAME} from '../configuration/constants.js';
import assert from 'node:assert';
import {AiSdkRunner} from '../codegen/ai-sdk/ai-sdk-runner.js';
import {performance} from 'node:perf_hooks';

/**
* Creates and executes a task to generate or load code for a given prompt,
Expand Down Expand Up @@ -60,6 +61,7 @@ export async function startEvaluationTask(
// and for each sub-prompt, because the project will be augmented on each iteration.
const contextFiles = await resolveContextFiles(promptDef.contextFilePatterns, directory);

const generateStart = performance.now();
// Generate the initial set of files through the LLM.
const initialResponse = await generateInitialFiles(
config,
Expand All @@ -76,6 +78,7 @@ export async function startEvaluationTask(
abortSignal,
progress,
);
const generateDurationMs = performance.now() - generateStart;

const toolLogs = initialResponse.toolLogs ?? [];

Expand Down Expand Up @@ -140,6 +143,7 @@ export async function startEvaluationTask(
}

const attemptDetails: AttemptDetails[] = []; // Store details for assessment.json
const timings = {generateDurationMs, buildDurationMs: 0, repairDurationMs: 0};

// Try to build the files in the root prompt directory.
// This will also attempt to fix issues with the generated code.
Expand All @@ -156,6 +160,7 @@ export async function startEvaluationTask(
workerConcurrencyQueue,
progress,
userJourneyAgentTaskInput,
timings,
);

if (!attempt) {
Expand Down Expand Up @@ -197,6 +202,7 @@ export async function startEvaluationTask(
toolLogs,
testResult: attempt.testResult ?? null,
testRepairAttempts: attempt.testRepairAttempts,
timings,
} satisfies AssessmentResult);
}

Expand Down
42 changes: 42 additions & 0 deletions runner/ratings/stats.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import {BuildErrorType, BuildResultStatus} from '../workers/builder/builder-type
import {UserFacingError} from '../utils/errors.js';
import {
AggregatedRunStats,
AggregatedTimings,
AssessmentResult,
RuntimeStats,
ScoreBucket,
Expand All @@ -15,13 +16,32 @@ export const BUCKET_CONFIG = [
{name: 'Poor', min: 0, max: 70, id: 'poor'},
];

/**
 * Computes the arithmetic mean of a list of numbers.
 *
 * @param values - Numbers to average.
 * @returns The sum of `values` divided by their count, or `0` for an empty list.
 */
function calculateMean(values: number[]): number {
  const count = values.length;
  if (count === 0) {
    return 0;
  }

  // Accumulate left-to-right, starting from zero.
  let total = 0;
  values.forEach(entry => {
    total += entry;
  });
  return total / count;
}

/**
 * Computes the median of a list of numbers without mutating the input.
 *
 * @param values - Numbers to inspect; they do not need to be sorted.
 * @returns The middle value for an odd count, the average of the two middle
 *     values for an even count, or `0` for an empty list.
 */
function calculateMedian(values: number[]): number {
  const count = values.length;
  if (count === 0) {
    return 0;
  }

  // Sort a copy numerically so the caller's array keeps its order.
  const ordered = Array.from(values);
  ordered.sort((left, right) => left - right);

  const upperIndex = Math.floor(count / 2);
  const hasOddCount = count % 2 !== 0;
  return hasOddCount
    ? ordered[upperIndex]
    : (ordered[upperIndex - 1] + ordered[upperIndex]) / 2;
}

/**
* Calculates build and check statistics from assessment results.
*
* @param assessments - An array of `AssessmentResult` objects.
* @returns An object containing aggregated build and check statistics.
*/
export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): AggregatedRunStats {
const generateDurations: number[] = [];
const buildDurations: number[] = [];
const repairDurations: number[] = [];

let successfulInitialBuilds = 0;
let successfulBuildsAfterRepair = 0;
let failedBuilds = 0;
Expand Down Expand Up @@ -63,6 +83,12 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
}
}

if (result.timings) {
generateDurations.push(result.timings.generateDurationMs);
buildDurations.push(result.timings.buildDurationMs);
repairDurations.push(result.timings.repairDurationMs);
}

// Calculate test statistics
if (result.testResult) {
if (result.testResult.passed) {
Expand Down Expand Up @@ -158,6 +184,22 @@ export function calculateBuildAndCheckStats(assessments: AssessmentResult[]): Ag
: undefined,
accessibility: accessibilityStats,
security: securityStats,
...(generateDurations.length > 0 && {
timings: {
generate: {
mean: calculateMean(generateDurations),
median: calculateMedian(generateDurations),
},
build: {
mean: calculateMean(buildDurations),
median: calculateMedian(buildDurations),
},
repair: {
mean: calculateMean(repairDurations),
median: calculateMedian(repairDurations),
},
},
}),
};
}

Expand Down
19 changes: 19 additions & 0 deletions runner/shared-interfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,20 @@ export interface SkippedIndividualAssessment {
groupingLabels?: string[];
}

/**
 * Stores the duration in milliseconds for different phases of the evaluation.
 *
 * The build and repair fields are accumulated with `+=` while an assessment runs,
 * so an instance is expected to start at zero for those fields.
 */
export interface AssessmentTimings {
/** Elapsed time of the initial file generation through the LLM. */
generateDurationMs: number;
/** Accumulated elapsed time of the initial build. */
buildDurationMs: number;
/**
 * Accumulated elapsed time of repair attempts, measured from just before each
 * `repairAndBuild` call. NOTE(review): on some paths the measurement appears to
 * also cover work awaited after the repair itself — confirm against the caller.
 */
repairDurationMs: number;
}

/**
 * Stores aggregated timing statistics.
 *
 * Each entry holds the mean and median of the matching `AssessmentTimings`
 * duration across the assessment results that reported timings, so the values
 * are in milliseconds.
 */
export interface AggregatedTimings {
/** Mean and median of `generateDurationMs`. */
generate: {mean: number; median: number};
/** Mean and median of `buildDurationMs`. */
build: {mean: number; median: number};
/** Mean and median of `repairDurationMs`. */
repair: {mean: number; median: number};
}

/**
* Represents the overall score and breakdown of code assessments.
*/
Expand Down Expand Up @@ -345,6 +359,9 @@ export interface AggregatedRunStats {
appsWithoutErrors: number;
};
security?: {appsWithErrors: number; appsWithoutErrors: number};

/** Timing statistics for the run. */
timings?: AggregatedTimings;
}

export interface CompletionStats {
Expand Down Expand Up @@ -555,6 +572,8 @@ export interface AssessmentResult {
testResult: TestExecutionResult | null;
/** Number of repair attempts for tests. */
testRepairAttempts?: number;
/** Timings captured for the execution and repair stages. */
timings?: AssessmentTimings;
}

/**
Expand Down
Loading