Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions .agents/skills/helm-dev-environment/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,12 @@ mise run helm:k3s:create
```

Creates a k3d cluster and merges its kubeconfig into the worktree-local `kubeconfig` file.
Also applies the upstream agent-sandbox CRDs/controller (pinned via `AGENT_SANDBOX_VERSION`
in `tasks/scripts/helm-k3s-local.sh`, fetched from `github.com/kubernetes-sigs/agent-sandbox`
releases) and preloads the default community sandbox image into k3d so the first sandbox
create does not wait on a large registry pull. Traefik is disabled at cluster creation time.
Also applies the upstream agent-sandbox CRDs/controller plus the warm-pool extensions
(`SandboxTemplate` / `SandboxWarmPool` / `SandboxClaim`, from `extensions.yaml`) — both
pinned via `AGENT_SANDBOX_VERSION` in `tasks/scripts/helm-k3s-local.sh`, fetched from
`github.com/kubernetes-sigs/agent-sandbox` releases — and preloads the default community
sandbox image into k3d so the first sandbox create does not wait on a large registry pull.
Traefik is disabled at cluster creation time.

**Multi-worktree support:** the cluster name is derived from the last component of the
current git branch (e.g. branch `kube-support/local-dev/tmutch` → cluster
Expand Down
49 changes: 49 additions & 0 deletions crates/openshell-core/src/driver_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,55 @@ pub const LABEL_SANDBOX_NAME: &str = "openshell.ai/sandbox-name";
/// Container/pod label carrying the sandbox namespace.
pub const LABEL_SANDBOX_NAMESPACE: &str = "openshell.ai/sandbox-namespace";

// ---------------------------------------------------------------------------
// agent-sandbox CRD identity (group / version / kind / contract labels)
//
// These describe the upstream agent-sandbox CRDs the Kubernetes driver creates
// and that the gateway's `IssueSandboxToken` auth path re-anchors against. They
// are a cross-crate *security* contract: the ownerReference and claim-UID
// checks in `openshell-server` must agree byte-for-byte with what
// `openshell-driver-kubernetes` writes, so they live here as the single source
// of truth rather than being duplicated per crate.
// ---------------------------------------------------------------------------

/// API group of the core agent-sandbox `Sandbox` CRD.
pub const SANDBOX_CRD_GROUP: &str = "agents.x-k8s.io";

/// API version shared by the agent-sandbox CRDs (base and extensions).
pub const SANDBOX_CRD_VERSION: &str = "v1alpha1";

/// `apiVersion` (`group/version`) of the core `Sandbox` CRD.
///
/// Written out as a literal (`concat!` only accepts literals, not `const`
/// items); a compile-time guard at the end of this section keeps it equal to
/// [`SANDBOX_CRD_GROUP`] + `/` + [`SANDBOX_CRD_VERSION`].
pub const SANDBOX_CRD_API_VERSION: &str = "agents.x-k8s.io/v1alpha1";

/// Kind of the core agent-sandbox `Sandbox` CRD.
pub const SANDBOX_CRD_KIND: &str = "Sandbox";

/// API group of the warm-pool extension CRDs.
pub const SANDBOX_EXT_GROUP: &str = "extensions.agents.x-k8s.io";

/// `apiVersion` (`group/version`) of the warm-pool extension CRDs.
///
/// Written out as a literal; a compile-time guard at the end of this section
/// keeps it equal to [`SANDBOX_EXT_GROUP`] + `/` + [`SANDBOX_CRD_VERSION`].
pub const SANDBOX_EXT_API_VERSION: &str = "extensions.agents.x-k8s.io/v1alpha1";

/// Kind of the warm-pool `SandboxClaim` extension CRD.
pub const SANDBOX_CLAIM_KIND: &str = "SandboxClaim";

/// Kind of the warm-pool `SandboxTemplate` extension CRD.
pub const SANDBOX_TEMPLATE_KIND: &str = "SandboxTemplate";

/// Kind of the warm-pool `SandboxWarmPool` extension CRD.
pub const SANDBOX_WARM_POOL_KIND: &str = "SandboxWarmPool";

/// Pod annotation carrying the per-sandbox identity (note the `.io` suffix —
/// distinct from the `.ai` [`LABEL_SANDBOX_ID`]). The gateway resolves a pod's
/// ServiceAccount token back to this value during `IssueSandboxToken`.
pub const SANDBOX_ID_ANNOTATION: &str = "openshell.io/sandbox-id";

/// Label the upstream warm-pool controller stamps onto a bound `Sandbox` CR
/// (and that the gateway sets on the `SandboxClaim`), carrying the binding
/// `SandboxClaim`'s apiserver-assigned UID. The auth path cross-checks it
/// against the controlling `SandboxClaim` ownerReference.
pub const CLAIM_UID_LABEL: &str = "agents.x-k8s.io/claim-uid";

/// Returns `true` when `api_version` is exactly `{group}/{version}`, compared
/// byte-for-byte.
///
/// This is a `const fn` so the guards below can run it at compile time. It
/// walks the bytes with a `while` loop because `for` loops and `str` equality
/// are not available in `const` context on stable Rust.
const fn is_group_version(api_version: &str, group: &str, version: &str) -> bool {
    let full = api_version.as_bytes();
    let group = group.as_bytes();
    let version = version.as_bytes();

    // Length check first: it also makes every index below in-bounds.
    if full.len() != group.len() + 1 + version.len() {
        return false;
    }

    let mut i = 0;
    while i < full.len() {
        let expected = if i < group.len() {
            group[i]
        } else if i == group.len() {
            b'/'
        } else {
            version[i - group.len() - 1]
        };
        if full[i] != expected {
            return false;
        }
        i += 1;
    }
    true
}

// Compile-time guards: the `apiVersion` literals above are duplicated by hand,
// and both crates must agree on them byte-for-byte. Fail the build (rather
// than an auth check at runtime) if a group or version is edited without its
// matching `*_API_VERSION`.
const _: () = assert!(is_group_version(
    SANDBOX_CRD_API_VERSION,
    SANDBOX_CRD_GROUP,
    SANDBOX_CRD_VERSION
));
const _: () = assert!(is_group_version(
    SANDBOX_EXT_API_VERSION,
    SANDBOX_EXT_GROUP,
    SANDBOX_CRD_VERSION
));

// ---------------------------------------------------------------------------

/// Path to the sandbox supervisor binary inside the container image.
Expand Down
6 changes: 5 additions & 1 deletion crates/openshell-driver-docker/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1357,7 +1357,7 @@ impl ComputeDriver for DockerComputeDriver {
.sandbox
.ok_or_else(|| Status::invalid_argument("sandbox is required"))?;
self.create_sandbox_inner(&sandbox).await?;
Ok(Response::new(CreateSandboxResponse {}))
Ok(Response::new(CreateSandboxResponse { warm_claim: None }))
}

async fn stop_sandbox(
Expand Down Expand Up @@ -1482,6 +1482,8 @@ fn pending_sandbox_snapshot(
sandbox_fd: String::new(),
conditions: vec![condition],
deleting,
// Warm-pool claim fields apply to the Kubernetes driver only.
..Default::default()
}),
}
}
Expand Down Expand Up @@ -2624,6 +2626,8 @@ fn driver_status_from_summary(
last_transition_time: String::new(),
}],
deleting,
// Warm-pool claim fields apply to the Kubernetes driver only.
..Default::default()
}
}

Expand Down
1 change: 1 addition & 0 deletions crates/openshell-driver-docker/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ fn test_sandbox() -> DriverSandbox {
}),
gpu: false,
sandbox_token: String::new(),
disallow_warm_pool: false,
}),
status: None,
}
Expand Down
47 changes: 47 additions & 0 deletions crates/openshell-driver-kubernetes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,53 @@ This is a stopgap persistence model. It preserves user files across pod
rescheduling but duplicates the base workspace and does not automatically apply
image updates to existing PVCs. Future snapshotting should replace it.

## Warm Pools

When `warm_pool.enabled` is set, the driver pre-declares one operator-owned
`SandboxTemplate` + `SandboxWarmPool` per configured pool (extension CRDs
`extensions.agents.x-k8s.io/v1alpha1`) and satisfies a matching `CreateSandbox`
by creating a `SandboxClaim` that binds a pre-warmed pod (~0.1s) instead of
cold-creating a `Sandbox`. Only requests for the trusted `default_image` with no
per-request template, env overrides, or custom policy are served from a pool;
everything else takes the cold path.

The pooled `SandboxTemplate` bakes the shared pod blueprint (image, mTLS mount,
projected SA token, supervisor sideload, capabilities, runtimeClass, optional
read-only shared data volume) but carries **no per-sandbox identity**. The
template sets `networkPolicyManagement: Unmanaged` — OpenShell enforces egress
itself (supervisor proxy + Landlock), and the controller's default `Managed`
mode would otherwise impose a rule-less, default-deny `NetworkPolicy` that blocks
the pod's own egress to the gateway. The writable `/sandbox` workspace is an
ephemeral `emptyDir` seeded from the image: single-use and fail-safe (the kubelet
reclaims it with the pod, so there is nothing to orphan). Claims set
`shutdownPolicy: Delete` so teardown cascades to the bound `Sandbox`/Pod, and a
claimed pod is never returned to the pool.

### Identity and policy on the warm path

A pooled pod boots **before** its claim assigns an identity, so the supervisor
cannot fetch a per-sandbox policy or assert a sandbox-id at startup. Instead it:

- boots on the image's **baseline policy** (Landlock is applied once at process
start and cannot be loosened later, so the baseline is what a pooled pod
enforces);
- runs with `OPENSHELL_SANDBOX_ID` unset, skipping identity-gated startup;
- establishes its relay session with an empty `hello.sandbox_id`; the gateway
derives the identity from the gateway-minted JWT obtained via
`IssueSandboxToken` (resolved server-side from the claim-injected
`openshell.io/sandbox-id` annotation + the durable claim mapping). The
supervisor retries the bootstrap at a steady cadence until the claim binds.

Because a pooled pod can only enforce the baseline policy, a `CreateSandbox`
request that carries a **custom policy** is never warm-pooled — the gateway sets
`DriverSandboxSpec.disallow_warm_pool`, and such requests take the cold path so
their policy is applied faithfully (no silent downgrade). Claim `env` injection
is likewise disallowed.

The driver surfaces a warm sandbox to the gateway by watching `SandboxClaim`s
(correlated to the gateway sandbox-id via the `openshell.ai/sandbox-id` label it
stamps on each claim) and returns the bound claim's `(name, uid)` so the gateway
can record the durable claim → sandbox-id mapping used by the auth re-anchor.

## Credentials, TLS, and Relay

The driver injects gateway callback configuration, sandbox identity, TLS client
Expand Down
115 changes: 115 additions & 0 deletions crates/openshell-driver-kubernetes/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,68 @@ pub struct KubernetesComputeConfig {
deserialize_with = "deserialize_provider_spiffe_workload_api_socket_path"
)]
pub provider_spiffe_workload_api_socket_path: String,
/// Warm-pool configuration. When enabled, the gateway pre-declares one or
/// more operator-owned `SandboxWarmPool`s and satisfies matching
/// `CreateSandbox` requests by binding a pre-warmed pod via a `SandboxClaim`
/// instead of cold-creating a `Sandbox`. Off by default.
#[serde(default)]
pub warm_pool: KubernetesWarmPoolConfig,
}

/// Warm-pool configuration for the Kubernetes driver.
///
/// Only **operator-declared** pools using the gateway's trusted default image
/// are warm-pooled. Requests for a user-supplied image, or requests that carry
/// a custom template, fall back to the cold path — per-tenant pool isolation
/// for untrusted images is out of scope (see RFC 0005 / issue #1879).
///
/// Deserialization: every field may be omitted (container-level
/// `#[serde(default)]`), and unknown keys are rejected
/// (`deny_unknown_fields`). The derived `Default` is `enabled = false` with
/// an empty `pools` list, i.e. warm pooling is off unless configured.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(default, deny_unknown_fields)]
pub struct KubernetesWarmPoolConfig {
/// Master switch. When false the driver never creates extension CRDs and
/// every request takes the cold path.
pub enabled: bool,
/// Declared pools. Each maps to exactly one (image, runtimeClass, gpu)
/// shape; the image is always the driver `default_image`.
pub pools: Vec<WarmPoolSpec>,
}

/// One operator-declared warm pool.
///
/// `name` and `replicas` are required when deserializing; the remaining
/// fields fall back to their defaults when omitted. Unknown keys are rejected
/// (`deny_unknown_fields`).
///
/// NOTE(review): this type only declares the shape. No validation of `name`
/// (DNS-1123) or of the sign of `replicas` is visible here — presumably it is
/// enforced where the config is consumed; confirm.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(deny_unknown_fields)]
pub struct WarmPoolSpec {
/// Pool name. Also names the backing `SandboxWarmPool` and `SandboxTemplate`
/// (suffixed). Must be a DNS-1123 label.
pub name: String,
/// Number of pods to keep warm.
pub replicas: i32,
/// `runtimeClassName` baked into the pool's `SandboxTemplate`. Empty uses
/// the driver's `default_runtime_class_name`.
#[serde(default)]
pub runtime_class_name: String,
/// Whether pooled pods request a GPU. GPU pools hold accelerators idle, so
/// they are opt-in per pool.
#[serde(default)]
pub gpu: bool,
/// Optional shared, read-only data volume mounted into every pooled pod
/// (datasets / models / caches). Safe to share precisely because it is
/// read-only and holds no per-agent state. Omitted from serialized output
/// when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub shared_volume: Option<SharedVolumeSpec>,
}

/// A pre-existing PVC mounted read-only into pooled pods.
///
/// `claim_name` and `mount_path` are required when deserializing; `sub_path`
/// is optional. Unknown keys are rejected (`deny_unknown_fields`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(deny_unknown_fields)]
pub struct SharedVolumeSpec {
/// Name of an existing `PersistentVolumeClaim` in the sandbox namespace.
/// Must support a read-many access mode (`ReadOnlyMany`/`ReadWriteMany`)
/// when pods land on multiple nodes.
pub claim_name: String,
/// Absolute path the volume is mounted at inside the sandbox.
pub mount_path: String,
/// Optional sub-path within the volume to mount. Defaults to the empty
/// string when omitted, and an empty value is left out of serialized output.
#[serde(default, skip_serializing_if = "String::is_empty")]
pub sub_path: String,
}

/// Lower bound enforced by kubelet for projected SA tokens.
Expand Down Expand Up @@ -246,6 +308,7 @@ impl Default for KubernetesComputeConfig {
default_runtime_class_name: String::new(),
sa_token_ttl_secs: 3600,
provider_spiffe_workload_api_socket_path: String::new(),
warm_pool: KubernetesWarmPoolConfig::default(),
}
}
}
Expand Down Expand Up @@ -459,4 +522,56 @@ mod tests {
let cfg: KubernetesComputeConfig = serde_json::from_value(json).unwrap();
assert_eq!(cfg.image_pull_secrets, ["regcred", "backup-regcred"]);
}

#[test]
fn default_warm_pool_is_disabled() {
    // A default-constructed driver config must leave warm pooling off and
    // declare no pools.
    let defaults = KubernetesComputeConfig::default();
    let warm_pool = &defaults.warm_pool;

    assert!(!warm_pool.enabled);
    assert!(warm_pool.pools.is_empty());
}

#[test]
fn serde_override_warm_pool() {
    // Two pools: a minimal one that relies on every optional default, and a
    // GPU pool that sets every optional field, including the shared volume.
    let payload = serde_json::json!({
        "warm_pool": {
            "enabled": true,
            "pools": [
                { "name": "default", "replicas": 3 },
                {
                    "name": "gpu",
                    "replicas": 1,
                    "gpu": true,
                    "runtime_class_name": "nvidia",
                    "shared_volume": {
                        "claim_name": "models",
                        "mount_path": "/models",
                        "sub_path": "llama"
                    }
                }
            ]
        }
    });
    let parsed = serde_json::from_value::<KubernetesComputeConfig>(payload).unwrap();

    let warm_pool = &parsed.warm_pool;
    assert!(warm_pool.enabled);

    let pools = &warm_pool.pools;
    assert_eq!(pools.len(), 2);
    let (minimal, gpu_pool) = (&pools[0], &pools[1]);

    // Minimal pool: explicit name/replicas, defaults everywhere else.
    assert_eq!(minimal.name, "default");
    assert_eq!(minimal.replicas, 3);
    assert!(!minimal.gpu);
    assert!(minimal.shared_volume.is_none());

    // GPU pool: every optional field round-trips.
    assert!(gpu_pool.gpu);
    assert_eq!(gpu_pool.runtime_class_name, "nvidia");
    let volume = gpu_pool.shared_volume.as_ref().unwrap();
    assert_eq!(volume.claim_name, "models");
    assert_eq!(volume.mount_path, "/models");
    assert_eq!(volume.sub_path, "llama");
}

#[test]
fn serde_rejects_unknown_warm_pool_field() {
    // `bogus` is not a field of the warm-pool config, so deserialization must
    // fail rather than silently ignore it.
    let payload = serde_json::json!({
        "warm_pool": { "enabled": true, "bogus": 1 }
    });
    let outcome: Result<KubernetesComputeConfig, _> = serde_json::from_value(payload);
    assert!(outcome.is_err());
}
}
Loading
Loading