-
Notifications
You must be signed in to change notification settings - Fork 19
Place inner container cgroups under the pod's cgroup tree #169
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -256,7 +256,12 @@ func dockerCmd() *cobra.Command { | |
| log.Debug(ctx, "starting dockerd", slog.F("args", args)) | ||
|
|
||
| blog.Info("Waiting for sysbox processes to startup...") | ||
| dockerd := background.New(ctx, log, "dockerd", dargs...) | ||
| wrapCmd, wrapArgs := wrapDockerdCmd(dargs) | ||
| dockerd := background.New(ctx, log, wrapCmd, wrapArgs...) | ||
| // The unshare wrapper exec's into dockerd, so /proc/<pid>/cmdline | ||
| // reflects "dockerd" not "unshare". Track exits against the | ||
| // post-exec name to keep Restart/kill detection accurate. | ||
| dockerd.SetBinName(dockerdBinName) | ||
| err = dockerd.Start() | ||
| if err != nil { | ||
| return xerrors.Errorf("start dockerd: %w", err) | ||
|
|
@@ -289,12 +294,16 @@ func dockerCmd() *cobra.Command { | |
| log.Fatal(ctx, "dockerd exited, failed getting args for restart", slog.Error(err)) | ||
| } | ||
|
|
||
| err = dockerd.Restart(ctx, "dockerd", args...) | ||
| wrapCmd, wrapArgs := wrapDockerdCmd(args) | ||
| err = dockerd.Restart(ctx, wrapCmd, wrapArgs...) | ||
| if err != nil { | ||
| blog.Info("Failed to create Container-based Virtual Machine: " + err.Error()) | ||
| //nolint | ||
| log.Fatal(ctx, "restart dockerd", slog.Error(err)) | ||
| } | ||
| // Restart resets the tracked binary name to its `cmd` | ||
| // argument; re-set it for the wrapped invocation. | ||
| dockerd.SetBinName(dockerdBinName) | ||
|
|
||
| err = <-dockerd.Wait() | ||
| } | ||
|
|
@@ -357,10 +366,14 @@ func dockerCmd() *cobra.Command { | |
|
|
||
| log.Debug(ctx, "restarting dockerd", slog.F("args", args)) | ||
|
|
||
| err = dockerd.Restart(ctx, "dockerd", args...) | ||
| wrapCmd, wrapArgs := wrapDockerdCmd(args) | ||
| err = dockerd.Restart(ctx, wrapCmd, wrapArgs...) | ||
| if err != nil { | ||
| return xerrors.Errorf("restart dockerd: %w", err) | ||
| } | ||
| // Restart resets the tracked binary name to its `cmd` | ||
| // argument; re-set it for the wrapped invocation. | ||
| dockerd.SetBinName(dockerdBinName) | ||
| go func() { | ||
| err = <-dockerd.Wait() | ||
| blog.Errorf("restarted dockerd exited: %v", err) | ||
|
|
@@ -881,6 +894,93 @@ func dockerdArgs(link, cidr string, isNoSpace bool) ([]string, error) { | |
| return args, nil | ||
| } | ||
|
|
||
| // dockerdBinName is the binary name reported by /proc/<pid>/cmdline once the | ||
| // `unshare` -> `/bin/sh` -> dockerd exec chain set up by wrapDockerdCmd has | ||
| // completed. background.Process tracking must use this name (not the literal | ||
| // "unshare" command we invoke) so that exit detection compares against the | ||
| // running cmdline correctly. | ||
| const dockerdBinName = "dockerd" | ||
|
|
||
| // dockerdSubtreeControlMaxAttempts caps the cgroup-subtree-control retry loop | ||
| // so a stuck race can't block envbox startup indefinitely. | ||
| const dockerdSubtreeControlMaxAttempts = 100 | ||
|
Comment on lines
+904
to
+906
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Worth mentioning this is a divergence from the upstream implementation. (I think we should keep it though) |
||
|
|
||
| // wrapDockerdCmd wraps the dockerd invocation with `unshare --cgroup`, plus a | ||
| // cgroup2 remount and cgroupv2 delegation setup, so that inner container | ||
| // cgroups become descendants of the envbox container's own cgroup on the | ||
| // host. Without this the inner Docker daemon places container cgroups at | ||
| // /sys/fs/cgroup/docker/<id>, which is a sibling of the pod's cgroup tree | ||
| // rather than a descendant; host-level cgroup-aware tools (Tetragon, Falco, | ||
| // custom eBPF agents) then cannot attribute processes inside a workspace | ||
| // container back to the pod that's running them. | ||
| // | ||
| // On cgroupv2 the delegation block (move all current processes into | ||
| // /sys/fs/cgroup/init, then enable cgroup.subtree_control on the root) is | ||
| // required: cgroupv2's "no internal processes" rule otherwise prevents | ||
| // dockerd from creating child cgroups with processes under a parent that | ||
| // already has its own processes. The block is gated on | ||
| // /sys/fs/cgroup/cgroup.controllers existing so that on cgroupv1 hosts the | ||
| // wrapper is effectively a no-op around `unshare --cgroup -- exec dockerd`. | ||
| // | ||
| // The delegation logic mirrors moby's hack/dind wrapper (lines 61-79): | ||
| // https://github.com/moby/moby/blob/8d9e3502aba39127e4d12196dae16d306f76993d/hack/dind#L61-L79 | ||
| // | ||
| // Note: the `umount /sys/fs/cgroup && mount -t cgroup2 ...` leaks back into | ||
| // envbox's mount namespace because we don't use `--mount` on unshare. Using | ||
| // `--mount` would break sysbox-fs propagation (its per-container mounts | ||
| // under /var/lib/sysboxfs/ stop being visible to sysbox-runc in the dockerd | ||
| // namespace). The observable side effect of the leak is that envbox's later | ||
| // cgroupv2 cpu.max read uses a different path; xunix.readCPUQuotaCGroupV2 | ||
| // has a fallback that handles this. | ||
| // | ||
| // See also: https://github.com/moby/moby/issues/45378#issuecomment-2886261231 | ||
| func wrapDockerdCmd(dargs []string) (string, []string) { | ||
| shellCmd := fmt.Sprintf(`set -e | ||
| # cgroup v2: enable nesting | ||
| if [ -f /sys/fs/cgroup/cgroup.controllers ]; then | ||
| # Remount /sys/fs/cgroup so the new cgroup namespace's view becomes the | ||
| # fs root. Inner container cgroups will then be created under the envbox | ||
| # container's cgroup on the host. | ||
| umount /sys/fs/cgroup | ||
| mount -t cgroup2 cgroup /sys/fs/cgroup | ||
|
Comment on lines
+944
to
+945
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion: |
||
|
|
||
| # move the processes from the root group to the /init group, | ||
| # otherwise writing subtree_control fails with EBUSY. | ||
| # An error during moving non-existent process (i.e., "cat") is ignored. | ||
| mkdir -p /sys/fs/cgroup/init | ||
| # this happens in a loop because things like "docker exec" on our dind | ||
| # container will create new processes, which creates a race between our | ||
| # moving everything to "init" and enabling subtree_control | ||
| envbox_attempts=0 | ||
| while ! { | ||
| # move the processes from the root group to the /init group, | ||
| # otherwise writing subtree_control fails with EBUSY. | ||
| # An error during moving non-existent process (i.e., "cat") is ignored. | ||
| xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || : | ||
| # enable controllers | ||
| sed -e 's/ / +/g' -e 's/^/+/' < /sys/fs/cgroup/cgroup.controllers \ | ||
| > /sys/fs/cgroup/cgroup.subtree_control | ||
| }; do | ||
| envbox_attempts=$((envbox_attempts + 1)) | ||
| if [ "$envbox_attempts" -ge %d ]; then | ||
| echo "envbox: failed to enable cgroup.subtree_control after $envbox_attempts attempts" >&2 | ||
| exit 1 | ||
| fi | ||
| done | ||
| fi | ||
| exec "$0" "$@" | ||
|
Comment on lines
+938
to
+971
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion: use |
||
| `, dockerdSubtreeControlMaxAttempts) | ||
| wrapperArgs := []string{ | ||
| "--cgroup", | ||
| "/bin/sh", | ||
| "-c", | ||
| shellCmd, | ||
| dockerdBinName, | ||
| } | ||
| wrapperArgs = append(wrapperArgs, dargs...) | ||
| return "unshare", wrapperArgs | ||
| } | ||
|
|
||
| // TODO This is bad code. | ||
| func filterElements(ss []string, filters ...string) []string { | ||
| filtered := make([]string, 0, len(ss)) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -178,18 +178,23 @@ func TestDocker(t *testing.T) { | |
| fmt.Sprintf("--bridge-cidr=%s", bridgeCIDR), | ||
| ) | ||
|
|
||
| dockerdArgs := []string{ | ||
| "--debug", | ||
| "--log-level=debug", | ||
| fmt.Sprintf("--mtu=%d", nl.Attrs().MTU), | ||
| "--userns-remap=coder", | ||
| "--storage-driver=overlay2", | ||
| fmt.Sprintf("--bip=%s", bridgeCIDR), | ||
| } | ||
| // dockerd is launched via an unshare wrapper that exec's into | ||
| // dockerd with these args; assert against the full wrapped argv. | ||
| wrapCmd, wrapArgs := cli.WrapDockerdCmd(dockerdArgs) | ||
|
Comment on lines
+189
to
+191
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This makes the test tautological. You should instead explicitly specify the expected |
||
| expectedArgv := append([]string{wrapCmd}, wrapArgs...) | ||
|
|
||
| execer := clitest.Execer(ctx) | ||
| execer.AddCommands(&xunixfake.FakeCmd{ | ||
| FakeCmd: &testingexec.FakeCmd{ | ||
| Argv: []string{ | ||
| "dockerd", | ||
| "--debug", | ||
| "--log-level=debug", | ||
| fmt.Sprintf("--mtu=%d", nl.Attrs().MTU), | ||
| "--userns-remap=coder", | ||
| "--storage-driver=overlay2", | ||
| fmt.Sprintf("--bip=%s", bridgeCIDR), | ||
| }, | ||
| Argv: expectedArgv, | ||
| }, | ||
| }) | ||
|
|
||
|
|
@@ -741,6 +746,42 @@ func TestDocker(t *testing.T) { | |
| }) | ||
| } | ||
|
|
||
| func TestWrapDockerdCmd(t *testing.T) { | ||
| t.Parallel() | ||
|
|
||
| dargs := []string{"--debug", "--mtu=1500"} | ||
| cmd, args := cli.WrapDockerdCmd(dargs) | ||
|
|
||
| // The wrapper invokes `unshare`, exec'ing through `/bin/sh -c <script>` | ||
| // and finally exec'ing dockerd. /proc/<pid>/cmdline ends up as "dockerd", | ||
| // which is what background.Process tracking should compare against. | ||
| require.Equal(t, "unshare", cmd) | ||
| require.Equal(t, "dockerd", cli.DockerdBinName) | ||
|
|
||
| // Argv prefix: --cgroup /bin/sh -c <script> dockerd <dargs...> | ||
| require.GreaterOrEqual(t, len(args), 6, "args=%v", args) | ||
| require.Equal(t, "--cgroup", args[0]) | ||
| require.Equal(t, "/bin/sh", args[1]) | ||
| require.Equal(t, "-c", args[2]) | ||
| require.Equal(t, cli.DockerdBinName, args[4]) | ||
| require.Equal(t, dargs, args[5:]) | ||
|
|
||
| // The shell script should: | ||
| // - guard the v2-only block on cgroup.controllers existing | ||
| // - perform the umount/mount inside that guard (cgroupv1 hosts unaffected) | ||
| // - delegate via /init + subtree_control | ||
| // - bound the retry loop | ||
| // - exec into dockerd | ||
| script := args[3] | ||
| require.Contains(t, script, "[ -f /sys/fs/cgroup/cgroup.controllers ]") | ||
| require.Contains(t, script, "umount /sys/fs/cgroup") | ||
| require.Contains(t, script, "mount -t cgroup2 cgroup /sys/fs/cgroup") | ||
| require.Contains(t, script, "mkdir -p /sys/fs/cgroup/init") | ||
| require.Contains(t, script, "/sys/fs/cgroup/cgroup.subtree_control") | ||
| require.Contains(t, script, fmt.Sprintf("ge %d ]", cli.DockerdSubtreeControlMaxAttempts)) | ||
| require.Contains(t, script, `exec "$0" "$@"`) | ||
| } | ||
|
|
||
| // rawDockerAuth is sample input for a kubernetes secret to a gcr.io private | ||
| // registry. | ||
| const rawDockerAuth = `{"auths":{"us.gcr.io":{"username":"_json_key","password":"{\"type\": \"service_account\", \"project_id\": \"some-test\", \"private_key_id\": \"blahblah\", \"private_key\": \"-----BEGIN PRIVATE KEY-----mykey-----END PRIVATE KEY-----\", \"client_email\": \"test@test.iam.gserviceaccount.com\", \"client_id\": \"123\", \"auth_uri\": \"https://accounts.google.com/o/oauth2/auth\", \"token_uri\": \"https://oauth2.googleapis.com/token\", \"auth_provider_x509_cert_url\": \"https://www.googleapis.com/oauth2/v1/certs\", \"client_x509_cert_url\": \"https://www.googleapis.com/robot/v1/metadata/x509/test.iam.gserviceaccount.com\" }","email":"test@test.iam.gserviceaccount.com","auth":"X2pzb25fa2V5OnsKCgkidHlwZSI6ICJzZXJ2aWNlX2FjY291bnQiLAoJInByb2plY3RfaWQiOiAic29tZS10ZXN0IiwKCSJwcml2YXRlX2tleV9pZCI6ICJibGFoYmxhaCIsCgkicHJpdmF0ZV9rZXkiOiAiLS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCm15a2V5LS0tLS1FTkQgUFJJVkFURSBLRVktLS0tLQoiLAoJImNsaWVudF9lbWFpbCI6ICJ0ZXN0QHRlc3QuaWFtLmdzZXJ2aWNlYWNjb3VudC5jb20iLAoJImNsaWVudF9pZCI6ICIxMjMiLAoJImF1dGhfdXJpIjogImh0dHBzOi8vYWNjb3VudHMuZ29vZ2xlLmNvbS9vL29hdXRoMi9hdXRoIiwKCSJ0b2tlbl91cmkiOiAiaHR0cHM6Ly9vYXV0aDIuZ29vZ2xlYXBpcy5jb20vdG9rZW4iLAoJImF1dGhfcHJvdmlkZXJfeDUwOV9jZXJ0X3VybCI6ICJodHRwczovL3d3dy5nb29nbGVhcGlzLmNvbS9vYXV0aDIvdjEvY2VydHMiLAoJImNsaWVudF94NTA5X2NlcnRfdXJsIjogImh0dHBzOi8vd3d3Lmdvb2dsZWFwaXMuY29tL3JvYm90L3YxL21ldGFkYXRhL3g1MDkvdGVzdC5pYW0uZ3NlcnZpY2VhY2NvdW50LmNvbSIKfQo="}}}` | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| package cli | ||
|
|
||
| // Aliases to expose internal helpers to the external _test package. | ||
| var ( | ||
| WrapDockerdCmd = wrapDockerdCmd | ||
| DockerdBinName = dockerdBinName | ||
| DockerdSubtreeControlMaxAttempts = dockerdSubtreeControlMaxAttempts | ||
| ) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After every restart, callers need to remember to set this otherwise it gets lost.
Would it make sense to instead make this a required parameter?