Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions background/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,24 @@ func (d *Process) Start() error {
return d.startProcess()
}

// SetBinName overrides the binary name used to detect whether the running
// process has exited (via the contents of /proc/<pid>/cmdline). By default
// the binary name is the `cmd` argument passed to New or Restart. When the
// command is a wrapper that exec's into a different binary (e.g.
// `unshare ... -- /bin/sh -c 'exec dockerd ...'`), the post-exec cmdline
// reflects the final binary rather than the wrapper, so the default
// binary name will not match. Call SetBinName with the post-exec binary
// name to keep exit detection accurate.
//
// SetBinName must be called again after each Restart since Restart resets
// the binary name to its `cmd` argument.
func (d *Process) SetBinName(name string) {
d.mu.Lock()
defer d.mu.Unlock()

d.binName = name
}
Comment on lines +70 to +75
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After every restart, callers need to remember to set this otherwise it gets lost.
Would it make sense to instead make this a required parameter?


// Wait waits for the process to exit, returning the error on the provided
// channel.
func (d *Process) Wait() <-chan error {
Expand Down
106 changes: 103 additions & 3 deletions cli/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,12 @@ func dockerCmd() *cobra.Command {
log.Debug(ctx, "starting dockerd", slog.F("args", args))

blog.Info("Waiting for sysbox processes to startup...")
dockerd := background.New(ctx, log, "dockerd", dargs...)
wrapCmd, wrapArgs := wrapDockerdCmd(dargs)
dockerd := background.New(ctx, log, wrapCmd, wrapArgs...)
// The unshare wrapper exec's into dockerd, so /proc/<pid>/cmdline
// reflects "dockerd" not "unshare". Track exits against the
// post-exec name to keep Restart/kill detection accurate.
dockerd.SetBinName(dockerdBinName)
err = dockerd.Start()
if err != nil {
return xerrors.Errorf("start dockerd: %w", err)
Expand Down Expand Up @@ -289,12 +294,16 @@ func dockerCmd() *cobra.Command {
log.Fatal(ctx, "dockerd exited, failed getting args for restart", slog.Error(err))
}

err = dockerd.Restart(ctx, "dockerd", args...)
wrapCmd, wrapArgs := wrapDockerdCmd(args)
err = dockerd.Restart(ctx, wrapCmd, wrapArgs...)
if err != nil {
blog.Info("Failed to create Container-based Virtual Machine: " + err.Error())
//nolint
log.Fatal(ctx, "restart dockerd", slog.Error(err))
}
// Restart resets the tracked binary name to its `cmd`
// argument; re-set it for the wrapped invocation.
dockerd.SetBinName(dockerdBinName)

err = <-dockerd.Wait()
}
Expand Down Expand Up @@ -357,10 +366,14 @@ func dockerCmd() *cobra.Command {

log.Debug(ctx, "restarting dockerd", slog.F("args", args))

err = dockerd.Restart(ctx, "dockerd", args...)
wrapCmd, wrapArgs := wrapDockerdCmd(args)
err = dockerd.Restart(ctx, wrapCmd, wrapArgs...)
if err != nil {
return xerrors.Errorf("restart dockerd: %w", err)
}
// Restart resets the tracked binary name to its `cmd`
// argument; re-set it for the wrapped invocation.
dockerd.SetBinName(dockerdBinName)
go func() {
err = <-dockerd.Wait()
blog.Errorf("restarted dockerd exited: %v", err)
Expand Down Expand Up @@ -881,6 +894,93 @@ func dockerdArgs(link, cidr string, isNoSpace bool) ([]string, error) {
return args, nil
}

// dockerdBinName is the binary name reported by /proc/<pid>/cmdline once the
// `unshare` -> `/bin/sh` -> dockerd exec chain set up by wrapDockerdCmd has
// completed. background.Process tracking must use this name (not the literal
// "unshare" command we invoke) so that exit detection compares against the
// running cmdline correctly.
const dockerdBinName = "dockerd"

// dockerdSubtreeControlMaxAttempts caps the cgroup-subtree-control retry loop
// so a stuck race can't block envbox startup indefinitely.
const dockerdSubtreeControlMaxAttempts = 100
Comment on lines +904 to +906
Copy link
Copy Markdown
Member

@johnstcn johnstcn May 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Worth mentioning this is a divergence from the upstream implementation. (I think we should keep it though)


// wrapDockerdCmd wraps the dockerd invocation with `unshare --cgroup`, plus a
// cgroup2 remount and cgroupv2 delegation setup, so that inner container
// cgroups become descendants of the envbox container's own cgroup on the
// host. Without this the inner Docker daemon places container cgroups at
// /sys/fs/cgroup/docker/<id>, which is a sibling of the pod's cgroup tree
// rather than a descendant; host-level cgroup-aware tools (Tetragon, Falco,
// custom eBPF agents) then cannot attribute processes inside a workspace
// container back to the pod that's running them.
//
// On cgroupv2 the delegation block (move all current processes into
// /sys/fs/cgroup/init, then enable cgroup.subtree_control on the root) is
// required: cgroupv2's "no internal processes" rule otherwise prevents
// dockerd from creating child cgroups with processes under a parent that
// already has its own processes. The block is gated on
// /sys/fs/cgroup/cgroup.controllers existing so that on cgroupv1 hosts the
// wrapper is effectively a no-op around `unshare --cgroup -- exec dockerd`.
//
// The delegation logic mirrors moby's hack/dind wrapper (lines 61-79):
// https://github.com/moby/moby/blob/8d9e3502aba39127e4d12196dae16d306f76993d/hack/dind#L61-L79
//
// Note: the `umount /sys/fs/cgroup && mount -t cgroup2 ...` leaks back into
// envbox's mount namespace because we don't use `--mount` on unshare. Using
// `--mount` would break sysbox-fs propagation (its per-container mounts
// under /var/lib/sysboxfs/ stop being visible to sysbox-runc in the dockerd
// namespace). The observable side effect of the leak is that envbox's later
// cgroupv2 cpu.max read uses a different path; xunix.readCPUQuotaCGroupV2
// has a fallback that handles this.
//
// See also: https://github.com/moby/moby/issues/45378#issuecomment-2886261231
func wrapDockerdCmd(dargs []string) (string, []string) {
shellCmd := fmt.Sprintf(`set -e
# cgroup v2: enable nesting
if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
# Remount /sys/fs/cgroup so the new cgroup namespace's view becomes the
# fs root. Inner container cgroups will then be created under the envbox
# container's cgroup on the host.
umount /sys/fs/cgroup
mount -t cgroup2 cgroup /sys/fs/cgroup
Comment on lines +944 to +945
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: set -e will cause this to abort with no context-specific error message, so consider something like

umount /sys/fs/cgroup || echo "envbox: failed to umount cgroup" >&2; exit 1;


# move the processes from the root group to the /init group,
# otherwise writing subtree_control fails with EBUSY.
# An error during moving non-existent process (i.e., "cat") is ignored.
mkdir -p /sys/fs/cgroup/init
# this happens in a loop because things like "docker exec" on our dind
# container will create new processes, which creates a race between our
# moving everything to "init" and enabling subtree_control
envbox_attempts=0
while ! {
# move the processes from the root group to the /init group,
# otherwise writing subtree_control fails with EBUSY.
# An error during moving non-existent process (i.e., "cat") is ignored.
xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || :
# enable controllers
sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \
> /sys/fs/cgroup/cgroup.subtree_control
}; do
envbox_attempts=$((envbox_attempts + 1))
if [ "$envbox_attempts" -ge %d ]; then
echo "envbox: failed to enable cgroup.subtree_control after $envbox_attempts attempts" >&2
exit 1
fi
done
fi
exec "$0" "$@"
Comment on lines +938 to +971
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: use //go:embed to extract this to a standalone file. You then get syntax highlighting and the ability to test it independently.

`, dockerdSubtreeControlMaxAttempts)
wrapperArgs := []string{
"--cgroup",
"/bin/sh",
"-c",
shellCmd,
dockerdBinName,
}
wrapperArgs = append(wrapperArgs, dargs...)
return "unshare", wrapperArgs
}

// TODO This is bad code.
func filterElements(ss []string, filters ...string) []string {
filtered := make([]string, 0, len(ss))
Expand Down
59 changes: 50 additions & 9 deletions cli/docker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,18 +178,23 @@ func TestDocker(t *testing.T) {
fmt.Sprintf("--bridge-cidr=%s", bridgeCIDR),
)

dockerdArgs := []string{
"--debug",
"--log-level=debug",
fmt.Sprintf("--mtu=%d", nl.Attrs().MTU),
"--userns-remap=coder",
"--storage-driver=overlay2",
fmt.Sprintf("--bip=%s", bridgeCIDR),
}
// dockerd is launched via an unshare wrapper that exec's into
// dockerd with these args; assert against the full wrapped argv.
wrapCmd, wrapArgs := cli.WrapDockerdCmd(dockerdArgs)
Comment on lines +189 to +191
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes the test tautological. You should instead explicitly specify the expected argv.

expectedArgv := append([]string{wrapCmd}, wrapArgs...)

execer := clitest.Execer(ctx)
execer.AddCommands(&xunixfake.FakeCmd{
FakeCmd: &testingexec.FakeCmd{
Argv: []string{
"dockerd",
"--debug",
"--log-level=debug",
fmt.Sprintf("--mtu=%d", nl.Attrs().MTU),
"--userns-remap=coder",
"--storage-driver=overlay2",
fmt.Sprintf("--bip=%s", bridgeCIDR),
},
Argv: expectedArgv,
},
})

Expand Down Expand Up @@ -741,6 +746,42 @@ func TestDocker(t *testing.T) {
})
}

func TestWrapDockerdCmd(t *testing.T) {
t.Parallel()

dargs := []string{"--debug", "--mtu=1500"}
cmd, args := cli.WrapDockerdCmd(dargs)

// The wrapper invokes `unshare`, exec'ing through `/bin/sh -c <script>`
// and finally exec'ing dockerd. /proc/<pid>/cmdline ends up as "dockerd",
// which is what background.Process tracking should compare against.
require.Equal(t, "unshare", cmd)
require.Equal(t, "dockerd", cli.DockerdBinName)

// Argv prefix: --cgroup /bin/sh -c <script> dockerd <dargs...>
require.GreaterOrEqual(t, len(args), 6, "args=%v", args)
require.Equal(t, "--cgroup", args[0])
require.Equal(t, "/bin/sh", args[1])
require.Equal(t, "-c", args[2])
require.Equal(t, cli.DockerdBinName, args[4])
require.Equal(t, dargs, args[5:])

// The shell script should:
// - guard the v2-only block on cgroup.controllers existing
// - perform the umount/mount inside that guard (cgroupv1 hosts unaffected)
// - delegate via /init + subtree_control
// - bound the retry loop
// - exec into dockerd
script := args[3]
require.Contains(t, script, "[ -f /sys/fs/cgroup/cgroup.controllers ]")
require.Contains(t, script, "umount /sys/fs/cgroup")
require.Contains(t, script, "mount -t cgroup2 cgroup /sys/fs/cgroup")
require.Contains(t, script, "mkdir -p /sys/fs/cgroup/init")
require.Contains(t, script, "/sys/fs/cgroup/cgroup.subtree_control")
require.Contains(t, script, fmt.Sprintf("ge %d ]", cli.DockerdSubtreeControlMaxAttempts))
require.Contains(t, script, `exec "$0" "$@"`)
}

// rawDockerAuth is sample input for a kubernetes secret to a gcr.io private
// registry.
const rawDockerAuth = `{"auths":{"us.gcr.io":{"username":"_json_key","password":"{\"type\": \"service_account\", \"project_id\": \"some-test\", \"private_key_id\": \"blahblah\", \"private_key\": \"-----BEGIN PRIVATE KEY-----mykey-----END PRIVATE KEY-----\", \"client_email\": \"test@test.iam.gserviceaccount.com\", \"client_id\": \"123\", \"auth_uri\": \"https://accounts.google.com/o/oauth2/auth\", \"token_uri\": \"https://oauth2.googleapis.com/token\", \"auth_provider_x509_cert_url\": \"https://www.googleapis.com/oauth2/v1/certs\", \"client_x509_cert_url\": \"https://www.googleapis.com/robot/v1/metadata/x509/test.iam.gserviceaccount.com\" }","email":"test@test.iam.gserviceaccount.com","auth":"X2pzb25fa2V5OnsKCgkidHlwZSI6ICJzZXJ2aWNlX2FjY291bnQiLAoJInByb2plY3RfaWQiOiAic29tZS10ZXN0IiwKCSJwcml2YXRlX2tleV9pZCI6ICJibGFoYmxhaCIsCgkicHJpdmF0ZV9rZXkiOiAiLS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCm15a2V5LS0tLS1FTkQgUFJJVkFURSBLRVktLS0tLQoiLAoJImNsaWVudF9lbWFpbCI6ICJ0ZXN0QHRlc3QuaWFtLmdzZXJ2aWNlYWNjb3VudC5jb20iLAoJImNsaWVudF9pZCI6ICIxMjMiLAoJImF1dGhfdXJpIjogImh0dHBzOi8vYWNjb3VudHMuZ29vZ2xlLmNvbS9vL29hdXRoMi9hdXRoIiwKCSJ0b2tlbl91cmkiOiAiaHR0cHM6Ly9vYXV0aDIuZ29vZ2xlYXBpcy5jb20vdG9rZW4iLAoJImF1dGhfcHJvdmlkZXJfeDUwOV9jZXJ0X3VybCI6ICJodHRwczovL3d3dy5nb29nbGVhcGlzLmNvbS9vYXV0aDIvdjEvY2VydHMiLAoJImNsaWVudF94NTA5X2NlcnRfdXJsIjogImh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29tL3JvYm90L3YxL21ldGFkYXRhL3g1MDkvdGVzdC5pYW0uZ3NlcnZpY2VhY2NvdW50LmNvbSIKfQo="}}}`
8 changes: 8 additions & 0 deletions cli/export_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package cli

// Aliases to expose internal helpers to the external _test package.
var (
WrapDockerdCmd = wrapDockerdCmd
DockerdBinName = dockerdBinName
DockerdSubtreeControlMaxAttempts = dockerdSubtreeControlMaxAttempts
)
16 changes: 14 additions & 2 deletions xunix/sys.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,21 @@ func readCPUQuotaCGroupV2(ctx context.Context) (CPUQuota, error) {
return CPUQuota{}, xerrors.Errorf("determine own cgroup: %w", err)
}

maxStr, err := afero.ReadFile(fs, filepath.Join("/sys/fs/cgroup/", self, "cpu.max"))
// Try the standard path (self-relative) first.
selfPath := filepath.Join("/sys/fs/cgroup/", self, "cpu.max")
maxStr, err := afero.ReadFile(fs, selfPath)
if err != nil {
return CPUQuota{}, xerrors.Errorf("read cpu.max outside container: %w", err)
// Fallback: read cpu.max at the mount root. This handles the case where
// `/sys/fs/cgroup/` has been remounted to be rooted at the current
// cgroup (e.g. after `unshare --cgroup` + `mount -t cgroup2`), so the
// self-relative path no longer exists — but the mount root IS the
// current cgroup and its cpu.max reflects the same limits.
const rootPath = "/sys/fs/cgroup/cpu.max"
rootMaxStr, rootErr := afero.ReadFile(fs, rootPath)
if rootErr != nil {
return CPUQuota{}, xerrors.Errorf("read cpu.max outside container (tried %s: %v; fallback %s: %w)", selfPath, err, rootPath, rootErr)
}
maxStr = rootMaxStr
}

list := strings.Split(string(bytes.TrimSpace(maxStr)), " ")
Expand Down
12 changes: 12 additions & 0 deletions xunix/sys_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,18 @@ func TestReadCPUQuota(t *testing.T) {
},
Expected: xunix.CPUQuota{Quota: -1, Period: 100000, CGroup: xunix.CGroupV2},
},
{
// Simulates the cgroup namespace + remount the dockerd wrapper
// performs: the self-relative path no longer exists but cpu.max
// at the mount root reflects the same limits.
Name: "CGroupV2_RootFallback",
Subpath: "docker/dummy",
FS: map[string]string{
"/proc/self/cgroup": "0::/kubepods/pod/container\n",
"/sys/fs/cgroup/cpu.max": "150000 100000\n",
},
Expected: xunix.CPUQuota{Quota: 150000, Period: 100000, CGroup: xunix.CGroupV2},
},
{
Name: "Empty",
FS: map[string]string{},
Expand Down
Loading