Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 65 additions & 7 deletions src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,37 @@ import {ContextInfo} from '@docker/actions-toolkit/lib/types/docker/docker';
import * as context from './context';
import * as stateHelper from './state-helper';

/**
 * Runs an async operation, retrying it with exponential backoff and jitter
 * when it rejects.
 *
 * The un-jittered base delay starts at `initialDelay` and doubles after every
 * failed attempt. Each actual wait is the base delay scaled by a random
 * factor in [0.8, 1.2) and is never longer than `maxDelay`.
 *
 * @param operation - Factory invoked once per attempt; its resolved value is returned.
 * @param maxRetries - Number of retries after the first attempt (total attempts = maxRetries + 1).
 * @param initialDelay - Base wait in milliseconds before the first retry.
 * @param maxDelay - Upper bound in milliseconds for any single wait.
 * @param shouldRetry - Predicate deciding whether a failure is retryable; defaults to always retrying.
 * @returns The value resolved by the first successful attempt.
 * @throws The value the last attempt rejected with, unchanged, once retries are
 *   exhausted or `shouldRetry` returns false.
 */
async function retryWithBackoff<T>(
  operation: () => Promise<T>,
  maxRetries: number = 3,
  initialDelay: number = 1000,
  maxDelay: number = 10000,
  shouldRetry: (error: Error) => boolean = () => true
): Promise<T> {
  // Keep the un-jittered base separate so the randomness does not compound
  // from one attempt to the next.
  let baseDelay = Math.min(initialDelay, maxDelay);

  for (let attempt = 0; ; attempt++) {
    try {
      return await operation();
    } catch (thrown) {
      // A rejection is not guaranteed to be an Error (it may even be null);
      // normalise it before handing it to the predicate or reading `.message`.
      const error = thrown instanceof Error ? thrown : new Error(String(thrown));

      if (attempt >= maxRetries || !shouldRetry(error)) {
        // Rethrow the original value so callers see exactly what was thrown.
        throw thrown;
      }

      // Apply +/-20% jitter, then clamp so a wait can never exceed maxDelay.
      const jitter = 0.8 + Math.random() * 0.4;
      const wait = Math.min(maxDelay, Math.round(baseDelay * jitter));

      core.info(`Retry ${attempt + 1}/${maxRetries} after ${wait}ms due to: ${error.message}`);
      await new Promise<void>(resolve => setTimeout(resolve, wait));

      baseDelay = Math.min(baseDelay * 2, maxDelay);
    }
  }
}

actionsToolkit.run(
// main
async () => {
Expand Down Expand Up @@ -165,13 +196,40 @@ actionsToolkit.run(

await core.group(`Booting builder`, async () => {
const inspectCmd = await toolkit.buildx.getCommand(await context.getInspectArgs(inputs, toolkit));
await Exec.getExecOutput(inspectCmd.command, inspectCmd.args, {
ignoreReturnCode: true
}).then(res => {
if (res.stderr.length > 0 && res.exitCode != 0) {
throw new Error(res.stderr.match(/(.*)\s*$/)?.[0]?.trim() ?? 'unknown error');
}
});

try {
await retryWithBackoff(
async () => {
// Create a promise that will timeout after 15 seconds
const timeoutPromise = new Promise<never>((_, reject) => {
setTimeout(() => {
reject(new Error('Timeout exceeded while waiting for buildkit to initialize'));
}, 15000); // 15 second timeout
});
Comment on lines +203 to +208
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need this timeout promise; the inspect command already has its own timeout: https://github.com/docker/buildx/blob/ea2b7020a4645bff395eb49e4e87ef08ba24eb93/commands/inspect.go#L38-L40

Copy link
Author

@danielamar101-pton danielamar101-pton May 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, this timeout is the core of the change: we found that this command can sometimes hang under high resource contention in our infrastructure, in particular during I/O bottlenecks.

This approach is a classic example of the "end-to-end principle" in distributed systems - reliability mechanisms at lower layers can't always account for failures at higher layers, so an additional timeout at the highest level provides more comprehensive protection against various failure modes across the entire stack.


// Create the actual command execution promise
const execPromise = Exec.getExecOutput(inspectCmd.command, inspectCmd.args, {
ignoreReturnCode: true
}).then(res => {
if (res.stderr.length > 0 && res.exitCode != 0) {
throw new Error(res.stderr.match(/(.*)\s*$/)?.[0]?.trim() ?? 'unknown error');
}
return res;
});

// Race the timeout against the actual command
// If the command takes too long, we'll get the timeout error instead
return Promise.race([execPromise, timeoutPromise]);
},
3, // maxRetries - retry up to 3 times for buildkit initialization
1000, // initialDelay - start with 1 second
15000 // maxDelay - cap at 15 seconds
);
} catch (error) {
// Log the warning but continue - this matches current behavior where builds still succeed
core.warning(`Failed to bootstrap builder after multiple retries: ${error.message}`);
core.warning('Continuing execution as buildkit daemon may initialize later');
}
});

if (inputs.install) {
Expand Down