Skip to content

Commit a7c8922

Browse files
authored
Added systemd oom handling and tests (#227)
* added systemd oom handling and tests * format * reduce restart loop to 1s * explicitly stdout/stderr * sudo * move health check to avoid deadlocking process * added changeset * updated as per comments * removed burst in jupyter.service too * updated env vars
1 parent f93189b commit a7c8922

9 files changed

Lines changed: 272 additions & 19 deletions

File tree

.changeset/empty-knives-make.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'@e2b/code-interpreter-template': patch
3+
---
4+
5+
added systemd to handle process restarts

js/tests/systemd.test.ts

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import { expect } from 'vitest'
2+
import { sandboxTest, wait } from './setup'
3+
4+
/**
 * Polls the code-interpreter health endpoint from inside the sandbox.
 *
 * @param sandbox    Sandbox whose `commands.run` executes the curl probe.
 * @param maxRetries Maximum number of probes before giving up.
 * @param intervalMs Pause between two consecutive probes, in milliseconds.
 * @returns `true` as soon as a probe reports HTTP 200, `false` if none did.
 */
async function waitForHealth(sandbox: any, maxRetries = 10, intervalMs = 100) {
  for (let attempt = 0; attempt < maxRetries; attempt++) {
    try {
      // --max-time bounds a single probe so a server that accepts the
      // connection but never answers cannot stall the whole retry loop.
      const result = await sandbox.commands.run(
        'curl -s --max-time 2 -o /dev/null -w "%{http_code}" http://0.0.0.0:49999/health'
      )
      if (result.stdout.trim() === '200') {
        return true
      }
    } catch {
      // Connection refused, timeout or other error, retry
    }
    // Sleeping after the last probe would only delay the negative result.
    if (attempt < maxRetries - 1) {
      await wait(intervalMs)
    }
  }
  return false
}
20+
21+
sandboxTest('restart after jupyter kill', async ({ sandbox }) => {
  // Verify health is up initially
  const initialHealth = await waitForHealth(sandbox)
  expect(initialHealth).toBe(true)

  // Kill the jupyter process as root.
  // The '[j]' bracket stops `pgrep -f` from matching the shell that runs this
  // very command line (its argv contains the pattern text), so only the real
  // jupyter server receives SIGKILL.
  // The command handle may still get killed (killing jupyter cascades to
  // code-interpreter), so we catch the error.
  try {
    await sandbox.commands.run("kill -9 $(pgrep -f '[j]upyter server')", {
      user: 'root',
    })
  } catch {
    // Expected — the kill cascade may terminate the command handle
  }

  // Wait for systemd to restart both services
  const recovered = await waitForHealth(sandbox, 60, 500)
  expect(recovered).toBe(true)

  // Verify code execution works after recovery
  const result = await sandbox.runCode('x = 1; x')
  expect(result.text).toEqual('1')
})
45+
46+
sandboxTest('restart after code-interpreter kill', async ({ sandbox }) => {
  // Verify health is up initially
  const initialHealth = await waitForHealth(sandbox)
  expect(initialHealth).toBe(true)

  // Kill the code-interpreter process as root.
  // The '[u]' bracket stops `pgrep -f` from matching the shell that runs this
  // very command line (its argv contains the pattern text), so only the real
  // uvicorn process receives SIGKILL.
  try {
    await sandbox.commands.run("kill -9 $(pgrep -f '[u]vicorn main:app')", {
      user: 'root',
    })
  } catch {
    // Expected — killing code-interpreter may terminate the command handle
  }

  // Wait for systemd to restart it and health to come back
  const recovered = await waitForHealth(sandbox, 60, 500)
  expect(recovered).toBe(true)

  // Verify code execution works after recovery
  const result = await sandbox.runCode('x = 1; x')
  expect(result.text).toEqual('1')
})
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import asyncio
2+
3+
from e2b_code_interpreter.code_interpreter_async import AsyncSandbox
4+
5+
6+
async def wait_for_health(sandbox: AsyncSandbox, max_retries=10, interval_ms=100):
    """Poll the code-interpreter health endpoint from inside the sandbox.

    Args:
        sandbox: Sandbox whose ``commands.run`` executes the curl probe.
        max_retries: Maximum number of probes before giving up.
        interval_ms: Pause between two consecutive probes, in milliseconds.

    Returns:
        True as soon as a probe reports HTTP 200, False if none did.
    """
    for attempt in range(max_retries):
        try:
            # --max-time bounds a single probe so a server that accepts the
            # connection but never answers cannot stall the whole retry loop.
            result = await sandbox.commands.run(
                'curl -s --max-time 2 -o /dev/null -w "%{http_code}" http://0.0.0.0:49999/health'
            )
            if result.stdout.strip() == "200":
                return True
        except Exception:
            # Best-effort probe: a refused connection or a timeout simply
            # means "not healthy yet", so fall through and retry.
            pass
        # Sleeping after the last probe would only delay the negative result.
        if attempt < max_retries - 1:
            await asyncio.sleep(interval_ms / 1000)
    return False
18+
19+
20+
async def test_restart_after_jupyter_kill(async_sandbox: AsyncSandbox):
    """Kill the Jupyter server and check that the sandbox recovers."""
    # Verify health is up initially
    assert await wait_for_health(async_sandbox)

    # Kill the jupyter process as root.
    # The '[j]' bracket stops `pgrep -f` from matching the shell that runs this
    # very command line (its argv contains the pattern text), so only the real
    # jupyter server receives SIGKILL.
    # The command handle may still get killed (killing jupyter cascades to
    # code-interpreter), so we catch the error.
    try:
        await async_sandbox.commands.run(
            "kill -9 $(pgrep -f '[j]upyter server')", user="root"
        )
    except Exception:
        pass

    # Wait for systemd to restart both services
    assert await wait_for_health(async_sandbox, 60, 500)

    # Verify code execution works after recovery
    result = await async_sandbox.run_code("x = 1; x")
    assert result.text == "1"
40+
41+
42+
async def test_restart_after_code_interpreter_kill(async_sandbox: AsyncSandbox):
    """Kill the code-interpreter server and check that the sandbox recovers."""
    # Verify health is up initially
    assert await wait_for_health(async_sandbox)

    # Kill the code-interpreter process as root.
    # The '[u]' bracket stops `pgrep -f` from matching the shell that runs this
    # very command line (its argv contains the pattern text), so only the real
    # uvicorn process receives SIGKILL.
    try:
        await async_sandbox.commands.run(
            "kill -9 $(pgrep -f '[u]vicorn main:app')", user="root"
        )
    except Exception:
        # Killing code-interpreter may terminate the command handle.
        pass

    # Wait for systemd to restart it and health to come back
    assert await wait_for_health(async_sandbox, 60, 500)

    # Verify code execution works after recovery
    result = await async_sandbox.run_code("x = 1; x")
    assert result.text == "1"

python/tests/sync/test_systemd.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import time
2+
3+
from e2b_code_interpreter.code_interpreter_sync import Sandbox
4+
5+
6+
def wait_for_health(sandbox: Sandbox, max_retries=10, interval_ms=100):
    """Poll the code-interpreter health endpoint from inside the sandbox.

    Args:
        sandbox: Sandbox whose ``commands.run`` executes the curl probe.
        max_retries: Maximum number of probes before giving up.
        interval_ms: Pause between two consecutive probes, in milliseconds.

    Returns:
        True as soon as a probe reports HTTP 200, False if none did.
    """
    for attempt in range(max_retries):
        try:
            # --max-time bounds a single probe so a server that accepts the
            # connection but never answers cannot stall the whole retry loop.
            result = sandbox.commands.run(
                'curl -s --max-time 2 -o /dev/null -w "%{http_code}" http://0.0.0.0:49999/health'
            )
            if result.stdout.strip() == "200":
                return True
        except Exception:
            # Best-effort probe: a refused connection or a timeout simply
            # means "not healthy yet", so fall through and retry.
            pass
        # Sleeping after the last probe would only delay the negative result.
        if attempt < max_retries - 1:
            time.sleep(interval_ms / 1000)
    return False
18+
19+
20+
def test_restart_after_jupyter_kill(sandbox: Sandbox):
    """Kill the Jupyter server and check that the sandbox recovers."""
    # Verify health is up initially
    assert wait_for_health(sandbox)

    # Kill the jupyter process as root.
    # The '[j]' bracket stops `pgrep -f` from matching the shell that runs this
    # very command line (its argv contains the pattern text), so only the real
    # jupyter server receives SIGKILL.
    # The command handle may still get killed (killing jupyter cascades to
    # code-interpreter), so we catch the error.
    try:
        sandbox.commands.run("kill -9 $(pgrep -f '[j]upyter server')", user="root")
    except Exception:
        pass

    # Wait for systemd to restart both services
    assert wait_for_health(sandbox, 60, 500)

    # Verify code execution works after recovery
    result = sandbox.run_code("x = 1; x")
    assert result.text == "1"
38+
39+
40+
def test_restart_after_code_interpreter_kill(sandbox: Sandbox):
    """Kill the code-interpreter server and check that the sandbox recovers."""
    # Verify health is up initially
    assert wait_for_health(sandbox)

    # Kill the code-interpreter process as root.
    # The '[u]' bracket stops `pgrep -f` from matching the shell that runs this
    # very command line (its argv contains the pattern text), so only the real
    # uvicorn process receives SIGKILL.
    try:
        sandbox.commands.run("kill -9 $(pgrep -f '[u]vicorn main:app')", user="root")
    except Exception:
        # Killing code-interpreter may terminate the command handle.
        pass

    # Wait for systemd to restart it and health to come back
    assert wait_for_health(sandbox, 60, 500)

    # Verify code execution works after recovery
    result = sandbox.run_code("x = 1; x")
    assert result.text == "1"

template/jupyter-healthcheck.sh

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
# Custom health check for Jupyter Server
# Verifies the server is responsive via the /api/status endpoint.
# Exits 0 on the first HTTP 200, exits 1 once every attempt has failed.

MAX_RETRIES=50
RETRY_INTERVAL=0.2
# Upper bound (seconds) for one request, so a server that accepts the
# connection but never answers cannot hang the check indefinitely.
REQUEST_TIMEOUT=1

for i in $(seq 1 "$MAX_RETRIES"); do
    status_code=$(curl -s --max-time "$REQUEST_TIMEOUT" -o /dev/null -w "%{http_code}" "http://localhost:8888/api/status")

    # String comparison stays well-defined even when curl printed nothing;
    # a numeric -eq test would raise "integer expression expected" instead.
    if [ "$status_code" = "200" ]; then
        echo "Jupyter Server is healthy"
        exit 0
    fi

    # Log progress only every 10th attempt to keep the journal quiet
    if [ $((i % 10)) -eq 0 ]; then
        echo "Waiting for Jupyter Server to become healthy... (attempt $i/$MAX_RETRIES)"
    fi
    sleep "$RETRY_INTERVAL"
done

echo "Jupyter Server health check failed after $MAX_RETRIES attempts"
exit 1

template/start-up.sh

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,16 @@
11
#!/bin/bash
22

3-
function start_jupyter_server() {
4-
counter=0
5-
response=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8888/api/status")
6-
while [[ ${response} -ne 200 ]]; do
7-
let counter++
8-
if ((counter % 20 == 0)); then
9-
echo "Waiting for Jupyter Server to start..."
10-
sleep 0.1
11-
fi
12-
13-
response=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8888/api/status")
14-
done
3+
# Waits for Jupyter to report healthy, then runs the code-interpreter API
# server in the foreground of the calling (sub)shell.
function start_code_interpreter() {
    # Test the health check's exit status directly instead of inspecting $?
    if ! /root/.jupyter/jupyter-healthcheck.sh; then
        echo "Jupyter Server failed to start, aborting."
        exit 1
    fi

    # Abort rather than launching uvicorn from the wrong working directory
    cd /root/.server/ || exit 1
    .venv/bin/uvicorn main:app --host 0.0.0.0 --port 49999 --workers 1 --no-access-log --no-use-colors --timeout-keep-alive 640
}
1913

2014
echo "Starting Code Interpreter server..."
21-
start_jupyter_server &
15+
start_code_interpreter &
2216
MATPLOTLIBRC=/root/.config/matplotlib/.matplotlibrc jupyter server --IdentityProvider.token="" >/dev/null 2>&1
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Runs the code-interpreter API server (uvicorn) alongside jupyter.service.
[Unit]
Description=Code Interpreter Server
Documentation=https://github.com/e2b-dev/code-interpreter
Requires=jupyter.service
After=jupyter.service
PartOf=jupyter.service
# Disable start rate limiting through the documented switch (interval = 0),
# so repeated crash/restart cycles never leave the unit stuck in "failed".
StartLimitIntervalSec=0

[Service]
Type=simple
WorkingDirectory=/root/.server
# Hold back uvicorn until the health check script reports Jupyter as healthy
ExecStartPre=/root/.jupyter/jupyter-healthcheck.sh
ExecStart=/root/.server/.venv/bin/uvicorn main:app --host 0.0.0.0 --port 49999 --workers 1 --no-access-log --no-use-colors --timeout-keep-alive 640
Restart=on-failure
RestartSec=1
StandardOutput=journal
StandardError=journal

template/systemd/jupyter.service

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Runs the Jupyter server and pulls in code-interpreter.service with it.
[Unit]
Description=Jupyter Server
Documentation=https://jupyter-server.readthedocs.io
Wants=code-interpreter.service
# Disable start rate limiting through the documented switch (interval = 0),
# so repeated crash/restart cycles never leave the unit stuck in "failed".
StartLimitIntervalSec=0

[Service]
Type=simple
Environment=MATPLOTLIBRC=/root/.config/matplotlib/.matplotlibrc
ExecStart=/usr/local/bin/jupyter server --IdentityProvider.token=""
# Leading "-" makes a failure of this helper command non-fatal for the unit
ExecStartPost=-/usr/bin/systemctl reset-failed code-interpreter
Restart=on-failure
RestartSec=1
StandardOutput=null
StandardError=journal

template/template.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,6 @@ def make_template(
1717
"PIP_DEFAULT_TIMEOUT": "100",
1818
"PIP_DISABLE_PIP_VERSION_CHECK": "1",
1919
"PIP_NO_CACHE_DIR": "1",
20-
"JUPYTER_CONFIG_PATH": "/root/.jupyter",
21-
"IPYTHON_CONFIG_PATH": "/root/.ipython",
22-
"SERVER_PATH": "/root/.server",
2320
"JAVA_VERSION": "11",
2421
"JAVA_HOME": "/usr/lib/jvm/jdk-${JAVA_VERSION}",
2522
"IJAVA_VERSION": "1.3.0",
@@ -110,14 +107,30 @@ def make_template(
110107
# Copy configuration files
111108
template = (
112109
template.copy("matplotlibrc", ".config/matplotlib/.matplotlibrc")
113-
.copy("start-up.sh", ".jupyter/start-up.sh")
114-
.run_cmd("chmod +x .jupyter/start-up.sh")
110+
.copy("jupyter-healthcheck.sh", ".jupyter/jupyter-healthcheck.sh")
111+
.run_cmd("chmod +x .jupyter/jupyter-healthcheck.sh")
115112
.copy("jupyter_server_config.py", ".jupyter/")
116113
.make_dir(".ipython/profile_default/startup")
117114
.copy("ipython_kernel_config.py", ".ipython/profile_default/")
118115
.copy("startup_scripts", ".ipython/profile_default/startup")
119116
)
120117

118+
if not is_docker:
119+
template = (
120+
template.copy(
121+
"systemd/jupyter.service", "/etc/systemd/system/jupyter.service"
122+
)
123+
.copy(
124+
"systemd/code-interpreter.service",
125+
"/etc/systemd/system/code-interpreter.service",
126+
)
127+
.run_cmd("systemctl daemon-reload")
128+
)
129+
else:
130+
template = template.copy("start-up.sh", ".jupyter/start-up.sh").run_cmd(
131+
"chmod +x .jupyter/start-up.sh"
132+
)
133+
121134
if is_docker:
122135
# create user user and /home/user
123136
template = template.run_cmd("useradd -m user")
@@ -130,6 +143,11 @@ def make_template(
130143

131144
template = template.set_user("user").set_workdir("/home/user")
132145

146+
if is_docker:
147+
start_cmd = "sudo /root/.jupyter/start-up.sh"
148+
else:
149+
start_cmd = "sudo systemctl start jupyter"
150+
133151
return template.set_start_cmd(
134-
"sudo /root/.jupyter/start-up.sh", wait_for_url("http://localhost:49999/health")
152+
start_cmd, wait_for_url("http://localhost:49999/health")
135153
)

0 commit comments

Comments
 (0)