coder · jamestoyer · Apr 27, 2026 · May 6, 2026 · johnstcn · May 7, 2026
diff --git a/background/process.go b/background/process.go
@@ -56,6 +56,24 @@ func (d *Process) Start() error {
 	return d.startProcess()
 }
 
+// SetBinName overrides the binary name used to detect whether the running
+// process has exited (via the contents of /proc/<pid>/cmdline). By default
+// the binary name is the `cmd` argument passed to New or Restart. When the
+// command is a wrapper that exec's into a different binary (e.g.
+// `unshare ... -- /bin/sh -c 'exec dockerd ...'`), the post-exec cmdline
+// reflects the final binary rather than the wrapper, so the default
+// binary name will not match. Call SetBinName with the post-exec binary
+// name to keep exit detection accurate.
+//
+// SetBinName must be called again after each Restart since Restart resets
+// the binary name to its `cmd` argument.
+func (d *Process) SetBinName(name string) {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	d.binName = name
+}
+
 // Wait waits for the process to exit, returning the error on the provided
 // channel.
 func (d *Process) Wait() <-chan error {

diff --git a/cli/docker.go b/cli/docker.go
@@ -256,7 +256,12 @@ func dockerCmd() *cobra.Command {
 			log.Debug(ctx, "starting dockerd", slog.F("args", args))
 
 			blog.Info("Waiting for sysbox processes to startup...")
-			dockerd := background.New(ctx, log, "dockerd", dargs...)
+			wrapCmd, wrapArgs := wrapDockerdCmd(dargs)
+			dockerd := background.New(ctx, log, wrapCmd, wrapArgs...)
+			// The unshare wrapper exec's into dockerd, so /proc/<pid>/cmdline
+			// reflects "dockerd" not "unshare". Track exits against the
+			// post-exec name to keep Restart/kill detection accurate.
+			dockerd.SetBinName(dockerdBinName)
 			err = dockerd.Start()
 			if err != nil {
 				return xerrors.Errorf("start dockerd: %w", err)
@@ -289,12 +294,16 @@ func dockerCmd() *cobra.Command {
 						log.Fatal(ctx, "dockerd exited, failed getting args for restart", slog.Error(err))
 					}
 
-					err = dockerd.Restart(ctx, "dockerd", args...)
+					wrapCmd, wrapArgs := wrapDockerdCmd(args)
+					err = dockerd.Restart(ctx, wrapCmd, wrapArgs...)
 					if err != nil {
 						blog.Info("Failed to create Container-based Virtual Machine: " + err.Error())
 						//nolint
 						log.Fatal(ctx, "restart dockerd", slog.Error(err))
 					}
+					// Restart resets the tracked binary name to its `cmd`
+					// argument; re-set it for the wrapped invocation.
+					dockerd.SetBinName(dockerdBinName)
 
 					err = <-dockerd.Wait()
 				}
@@ -357,10 +366,14 @@ func dockerCmd() *cobra.Command {
 
 					log.Debug(ctx, "restarting dockerd", slog.F("args", args))
 
-					err = dockerd.Restart(ctx, "dockerd", args...)
+					wrapCmd, wrapArgs := wrapDockerdCmd(args)
+					err = dockerd.Restart(ctx, wrapCmd, wrapArgs...)
 					if err != nil {
 						return xerrors.Errorf("restart dockerd: %w", err)
 					}
+					// Restart resets the tracked binary name to its `cmd`
+					// argument; re-set it for the wrapped invocation.
+					dockerd.SetBinName(dockerdBinName)
 					go func() {
 						err = <-dockerd.Wait()
 						blog.Errorf("restarted dockerd exited: %v", err)
@@ -881,6 +894,93 @@ func dockerdArgs(link, cidr string, isNoSpace bool) ([]string, error) {
 	return args, nil
 }
 
+// dockerdBinName is the binary name reported by /proc/<pid>/cmdline once the
+// `unshare` -> `/bin/sh` -> dockerd exec chain set up by wrapDockerdCmd has
+// completed. background.Process tracking must use this name (not the literal
+// "unshare" command we invoke) so that exit detection compares against the
+// running cmdline correctly.
+const dockerdBinName = "dockerd"
+
+// dockerdSubtreeControlMaxAttempts caps the cgroup-subtree-control retry loop
+// so a stuck race can't block envbox startup indefinitely.
+const dockerdSubtreeControlMaxAttempts = 100
+
+// wrapDockerdCmd wraps the dockerd invocation with `unshare --cgroup`, plus a
+// cgroup2 remount and cgroupv2 delegation setup, so that inner container
+// cgroups become descendants of the envbox container's own cgroup on the
+// host. Without this the inner Docker daemon places container cgroups at
+// /sys/fs/cgroup/docker/<id>, which is a sibling of the pod's cgroup tree
+// rather than a descendant; host-level cgroup-aware tools (Tetragon, Falco,
+// custom eBPF agents) then cannot attribute processes inside a workspace
+// container back to the pod that's running them.
+//
+// On cgroupv2 the delegation block (move all current processes into
+// /sys/fs/cgroup/init, then enable cgroup.subtree_control on the root) is
+// required: cgroupv2's "no internal processes" rule otherwise prevents
+// dockerd from creating child cgroups with processes under a parent that
+// already has its own processes. The block is gated on
+// /sys/fs/cgroup/cgroup.controllers existing so that on cgroupv1 hosts the
+// wrapper is effectively a no-op around `unshare --cgroup -- exec dockerd`.
+//
+// The delegation logic mirrors moby's hack/dind wrapper (lines 61-79):
+// https://github.com/moby/moby/blob/8d9e3502aba39127e4d12196dae16d306f76993d/hack/dind#L61-L79
+//
+// Note: the `umount /sys/fs/cgroup && mount -t cgroup2 ...` leaks back into
+// envbox's mount namespace because we don't use `--mount` on unshare. Using
+// `--mount` would break sysbox-fs propagation (its per-container mounts
+// under /var/lib/sysboxfs/ stop being visible to sysbox-runc in the dockerd
+// namespace). The observable side effect of the leak is that envbox's later
+// cgroupv2 cpu.max read uses a different path; xunix.readCPUQuotaCGroupV2
+// has a fallback that handles this.
+//
+// See also: https://github.com/moby/moby/issues/45378#issuecomment-2886261231
+func wrapDockerdCmd(dargs []string) (string, []string) {
+	shellCmd := fmt.Sprintf(`set -e
+# cgroup v2: enable nesting
+if [ -f /sys/fs/cgroup/cgroup.controllers ]; then
+	# Remount /sys/fs/cgroup so the new cgroup namespace's view becomes the
+	# fs root. Inner container cgroups will then be created under the envbox
+	# container's cgroup on the host.
+	umount /sys/fs/cgroup
+	mount -t cgroup2 cgroup /sys/fs/cgroup
+
+	# move the processes from the root group to the /init group,
+	# otherwise writing subtree_control fails with EBUSY.
+	# An error during moving non-existent process (i.e., "cat") is ignored.
+	mkdir -p /sys/fs/cgroup/init
+	# this happens in a loop because things like "docker exec" on our dind
+	# container will create new processes, which creates a race between our
+	# moving everything to "init" and enabling subtree_control
+	envbox_attempts=0
+	while ! {
+		# move the processes from the root group to the /init group,
+		# otherwise writing subtree_control fails with EBUSY.
+		# An error during moving non-existent process (i.e., "cat") is ignored.
+		xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || :
+		# enable controllers
+		sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \
+			> /sys/fs/cgroup/cgroup.subtree_control
+	}; do
+		envbox_attempts=$((envbox_attempts + 1))
+		if [ "$envbox_attempts" -ge %d ]; then
+			echo "envbox: failed to enable cgroup.subtree_control after $envbox_attempts attempts" >&2
+			exit 1
+		fi
+	done
+fi
+exec "$0" "$@"
+`, dockerdSubtreeControlMaxAttempts)
+	wrapperArgs := []string{
+		"--cgroup",
+		"/bin/sh",
+		"-c",
+		shellCmd,
+		dockerdBinName,
+	}
+	wrapperArgs = append(wrapperArgs, dargs...)
+	return "unshare", wrapperArgs
+}
+
 // TODO This is bad code.
 func filterElements(ss []string, filters ...string) []string {
 	filtered := make([]string, 0, len(ss))

diff --git a/cli/docker_test.go b/cli/docker_test.go
@@ -178,18 +178,23 @@ func TestDocker(t *testing.T) {
 			fmt.Sprintf("--bridge-cidr=%s", bridgeCIDR),
 		)
 
+		dockerdArgs := []string{
+			"--debug",
+			"--log-level=debug",
+			fmt.Sprintf("--mtu=%d", nl.Attrs().MTU),
+			"--userns-remap=coder",
+			"--storage-driver=overlay2",
+			fmt.Sprintf("--bip=%s", bridgeCIDR),
+		}
+		// dockerd is launched via an unshare wrapper that exec's into
+		// dockerd with these args; assert against the full wrapped argv.
+		wrapCmd, wrapArgs := cli.WrapDockerdCmd(dockerdArgs)
+		expectedArgv := append([]string{wrapCmd}, wrapArgs...)
+
 		execer := clitest.Execer(ctx)
 		execer.AddCommands(&xunixfake.FakeCmd{
 			FakeCmd: &testingexec.FakeCmd{
-				Argv: []string{
-					"dockerd",
-					"--debug",
-					"--log-level=debug",
-					fmt.Sprintf("--mtu=%d", nl.Attrs().MTU),
-					"--userns-remap=coder",
-					"--storage-driver=overlay2",
-					fmt.Sprintf("--bip=%s", bridgeCIDR),
-				},
+				Argv: expectedArgv,
 			},
 		})
 
@@ -741,6 +746,42 @@ func TestDocker(t *testing.T) {
 	})
 }
 
+func TestWrapDockerdCmd(t *testing.T) {
+	t.Parallel()
+
+	dargs := []string{"--debug", "--mtu=1500"}
+	cmd, args := cli.WrapDockerdCmd(dargs)
+
+	// The wrapper invokes `unshare`, exec'ing through `/bin/sh -c <script>`
+	// and finally exec'ing dockerd. /proc/<pid>/cmdline ends up as "dockerd",
+	// which is what background.Process tracking should compare against.
+	require.Equal(t, "unshare", cmd)
+	require.Equal(t, "dockerd", cli.DockerdBinName)
+
+	// Argv prefix: --cgroup /bin/sh -c <script> dockerd <dargs...>
+	require.GreaterOrEqual(t, len(args), 6, "args=%v", args)
+	require.Equal(t, "--cgroup", args[0])
+	require.Equal(t, "/bin/sh", args[1])
+	require.Equal(t, "-c", args[2])
+	require.Equal(t, cli.DockerdBinName, args[4])
+	require.Equal(t, dargs, args[5:])
+
+	// The shell script should:
+	//  - guard the v2-only block on cgroup.controllers existing
+	//  - perform the umount/mount inside that guard (cgroupv1 hosts unaffected)
+	//  - delegate via /init + subtree_control
+	//  - bound the retry loop
+	//  - exec into dockerd
+	script := args[3]
+	require.Contains(t, script, "[ -f /sys/fs/cgroup/cgroup.controllers ]")
+	require.Contains(t, script, "umount /sys/fs/cgroup")
+	require.Contains(t, script, "mount -t cgroup2 cgroup /sys/fs/cgroup")
+	require.Contains(t, script, "mkdir -p /sys/fs/cgroup/init")
+	require.Contains(t, script, "/sys/fs/cgroup/cgroup.subtree_control")
+	require.Contains(t, script, fmt.Sprintf("ge %d ]", cli.DockerdSubtreeControlMaxAttempts))
+	require.Contains(t, script, `exec "$0" "$@"`)
+}
+
 // rawDockerAuth is sample input for a kubernetes secret to a gcr.io private
 // registry.
 const rawDockerAuth = `{"auths":{"us.gcr.io":{"username":"_json_key","password":"{\"type\": \"service_account\", \"project_id\": \"some-test\", \"private_key_id\": \"blahblah\", \"private_key\": \"-----BEGIN PRIVATE KEY-----mykey-----END PRIVATE KEY-----\", \"client_email\": \"test@test.iam.gserviceaccount.com\", \"client_id\": \"123\", \"auth_uri\": \"https://accounts.google.com/o/oauth2/auth\", \"token_uri\": \"https://oauth2.googleapis.com/token\", \"auth_provider_x509_cert_url\": \"https://www.googleapis.com/oauth2/v1/certs\", \"client_x509_cert_url\": \"https://www.googleapis.com/robot/v1/metadata/x509/test.iam.gserviceaccount.com\" }","email":"test@test.iam.gserviceaccount.com","auth":"X2pzb25fa2V5OnsKCgkidHlwZSI6ICJzZXJ2aWNlX2FjY291bnQiLAoJInByb2plY3RfaWQiOiAic29tZS10ZXN0IiwKCSJwcml2YXRlX2tleV9pZCI6ICJibGFoYmxhaCIsCgkicHJpdmF0ZV9rZXkiOiAiLS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCm15a2V5LS0tLS1FTkQgUFJJVkFURSBLRVktLS0tLQoiLAoJImNsaWVudF9lbWFpbCI6ICJ0ZXN0QHRlc3QuaWFtLmdzZXJ2aWNlYWNjb3VudC5jb20iLAoJImNsaWVudF9pZCI6ICIxMjMiLAoJImF1dGhfdXJpIjogImh0dHBzOi8vYWNjb3VudHMuZ29vZ2xlLmNvbS9vL29hdXRoMi9hdXRoIiwKCSJ0b2tlbl91cmkiOiAiaHR0cHM6Ly9vYXV0aDIuZ29vZ2xlYXBpcy5jb20vdG9rZW4iLAoJImF1dGhfcHJvdmlkZXJfeDUwOV9jZXJ0X3VybCI6ICJodHRwczovL3d3dy5nb29nbGVhcGlzLmNvbS9vYXV0aDIvdjEvY2VydHMiLAoJImNsaWVudF94NTA5X2NlcnRfdXJsIjogImh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29tL3JvYm90L3YxL21ldGFkYXRhL3g1MDkvdGVzdC5pYW0uZ3NlcnZpY2VhY2NvdW50LmNvbSIKfQo="}}}`
diff --git a/cli/export_test.go b/cli/export_test.go
@@ -0,0 +1,8 @@
+package cli
+
+// Aliases to expose internal helpers to the external _test package.
+var (
+	WrapDockerdCmd                   = wrapDockerdCmd
+	DockerdBinName                   = dockerdBinName
+	DockerdSubtreeControlMaxAttempts = dockerdSubtreeControlMaxAttempts
+)
diff --git a/xunix/sys.go b/xunix/sys.go
@@ -63,9 +63,21 @@ func readCPUQuotaCGroupV2(ctx context.Context) (CPUQuota, error) {
 		return CPUQuota{}, xerrors.Errorf("determine own cgroup: %w", err)
 	}
 
-	maxStr, err := afero.ReadFile(fs, filepath.Join("/sys/fs/cgroup/", self, "cpu.max"))
+	// Try the standard path (self-relative) first.
+	selfPath := filepath.Join("/sys/fs/cgroup/", self, "cpu.max")
+	maxStr, err := afero.ReadFile(fs, selfPath)
 	if err != nil {
-		return CPUQuota{}, xerrors.Errorf("read cpu.max outside container: %w", err)
+		// Fallback: read cpu.max at the mount root. This handles the case where
+		// `/sys/fs/cgroup/` has been remounted to be rooted at the current
+		// cgroup (e.g. after `unshare --cgroup` + `mount -t cgroup2`), so the
+		// self-relative path no longer exists — but the mount root IS the
+		// current cgroup and its cpu.max reflects the same limits.
+		const rootPath = "/sys/fs/cgroup/cpu.max"
+		rootMaxStr, rootErr := afero.ReadFile(fs, rootPath)
+		if rootErr != nil {
+			return CPUQuota{}, xerrors.Errorf("read cpu.max outside container (tried %s: %v; fallback %s: %w)", selfPath, err, rootPath, rootErr)
+		}
+		maxStr = rootMaxStr
 	}
 
 	list := strings.Split(string(bytes.TrimSpace(maxStr)), " ")

diff --git a/xunix/sys_test.go b/xunix/sys_test.go
@@ -59,6 +59,18 @@ func TestReadCPUQuota(t *testing.T) {
 			},
 			Expected: xunix.CPUQuota{Quota: -1, Period: 100000, CGroup: xunix.CGroupV2},
 		},
+		{
+			// Simulates the cgroup namespace + remount the dockerd wrapper
+			// performs: the self-relative path no longer exists but cpu.max
+			// at the mount root reflects the same limits.
+			Name:    "CGroupV2_RootFallback",
+			Subpath: "docker/dummy",
+			FS: map[string]string{
+				"/proc/self/cgroup":         "0::/kubepods/pod/container\n",
+				"/sys/fs/cgroup/cpu.max":    "150000 100000\n",
+			},
+			Expected: xunix.CPUQuota{Quota: 150000, Period: 100000, CGroup: xunix.CGroupV2},
+		},
 		{
 			Name:  "Empty",
 			FS:    map[string]string{},