pytorch · mergennachin · Jun 16, 2026 · Jun 15, 2026 · Gasoonjia · Jun 16, 2026
diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
@@ -406,8 +406,9 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
 
   # Download prequantized model outside OUTPUT_DIR to avoid uploading on failure
   LOCAL_MODEL_DIR=$(mktemp -d)
-  INDUCTOR_CACHE=$(mktemp -d)
-  trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE"' EXIT
+  INDUCTOR_CACHE=$(mktemp -d "${RUNNER_TEMP:-/tmp}/inductor_cache_XXXXXX")
+  INDUCTOR_TMPDIR=$(mktemp -d "${RUNNER_TEMP:-/tmp}/tmpdir_XXXXXX")
+  trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE" "$INDUCTOR_TMPDIR"' EXIT
 
   python -c "from huggingface_hub import snapshot_download; snapshot_download('${HF_MODEL}', local_dir='${LOCAL_MODEL_DIR}')"
 
@@ -427,6 +428,7 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
   # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
   echo "::group::Export"
   EXPORT_LOG=$(mktemp)
+  TMPDIR="$INDUCTOR_TMPDIR" \
   TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
   python -m executorch.examples.models.qwen3_5_moe.export \
       --prequantized "$LOCAL_MODEL_DIR" \
@@ -473,8 +475,9 @@ if [ "$MODEL_NAME" = "gemma4_31b" ]; then
 
   # Download prequantized model outside OUTPUT_DIR to avoid uploading on failure
   LOCAL_MODEL_DIR=$(mktemp -d)
-  INDUCTOR_CACHE=$(mktemp -d)
-  trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE"' EXIT
+  INDUCTOR_CACHE=$(mktemp -d "${RUNNER_TEMP:-/tmp}/inductor_cache_XXXXXX")
+  INDUCTOR_TMPDIR=$(mktemp -d "${RUNNER_TEMP:-/tmp}/tmpdir_XXXXXX")
+  trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE" "$INDUCTOR_TMPDIR"' EXIT
 
   python -c "from huggingface_hub import snapshot_download; snapshot_download('${HF_MODEL}', local_dir='${LOCAL_MODEL_DIR}')"
 
@@ -498,6 +501,7 @@ if [ "$MODEL_NAME" = "gemma4_31b" ]; then
 
   # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
   echo "::group::Export"
+  TMPDIR="$INDUCTOR_TMPDIR" \
   TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
   python -m executorch.examples.models.gemma4_31b.export \
       --prequantized "$LOCAL_MODEL_DIR" \

diff --git a/Makefile b/Makefile
@@ -433,11 +433,12 @@ voxtral_tts-cuda:
 qwen3_5_moe-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."
 	cmake --workflow --preset llm-release-cuda
-	@echo "==> Building Qwen3.5 MoE runner with CUDA..."
+	@echo "==> Building Qwen3.5 MoE runner and no-bleed test with CUDA..."
 	cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-cuda
 	@echo ""
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
+	@echo "  Test:   cmake-out/examples/models/qwen3_5_moe/test_qwen35_moe_nobleed"
 
 gemma4_31b-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."

@@ -15,6 +15,9 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+set(_json_include
+    ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include
+)
 
 # gflags
 set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
@@ -60,13 +63,26 @@ endif()
 # Tokenizer
 list(APPEND link_libraries tokenizers::tokenizers)
 
-add_executable(qwen3_5_moe_runner main.cpp)
+add_executable(qwen3_5_moe_runner main.cpp qwen35_moe_engine.cpp)
 target_include_directories(
-  qwen3_5_moe_runner PUBLIC ${_common_include_directories}
+  qwen3_5_moe_runner PUBLIC ${_common_include_directories} ${_json_include}
 )
 target_link_libraries(qwen3_5_moe_runner PUBLIC ${link_libraries})
 
 if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(qwen3_5_moe_runner)
   target_link_options(qwen3_5_moe_runner PRIVATE "LINKER:-s")
 endif()
+
+if(EXECUTORCH_BUILD_CUDA)
+  # CUDA-only test binary; it compiles the same engine source as the runner.
+  add_executable(
+    test_qwen35_moe_nobleed test_qwen35_moe_nobleed.cpp qwen35_moe_engine.cpp
+  )
+  target_link_libraries(test_qwen35_moe_nobleed PUBLIC ${link_libraries})
+  target_include_directories(
+    test_qwen35_moe_nobleed PUBLIC ${_common_include_directories} ${_json_include}
+  )
+  enable_testing()
+  add_test(NAME qwen_nobleed COMMAND test_qwen35_moe_nobleed)
+endif()
@@ -41,9 +41,9 @@
     "buildPresets": [
         {
             "name": "qwen3-5-moe-cuda",
-            "displayName": "Build Qwen3.5 MoE runner (CUDA)",
+            "displayName": "Build Qwen3.5 MoE runner + no-bleed test (CUDA)",
             "configurePreset": "qwen3-5-moe-cuda",
-            "targets": ["qwen3_5_moe_runner"]
+            "targets": ["qwen3_5_moe_runner", "test_qwen35_moe_nobleed"]
         },
         {
             "name": "qwen3-5-moe-metal",

diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
@@ -100,7 +100,7 @@ It can be uploaded to HuggingFace Hub for easy sharing.
 
 ExecuTorch must be installed from source first (see
 [Prerequisites](#prerequisites)). The `make` target handles building
-core libraries and the runner binary.
+core libraries, the runner binary, and the CUDA no-bleed test binary.
 
 ```bash
 make qwen3_5_moe-cuda
@@ -109,6 +109,10 @@ make qwen3_5_moe-cuda
 This builds ExecuTorch with CUDA backend support, then the runner binary
 at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner`.
 
+The runner is a thin CLI over `Qwen35MoEEngine` and `Qwen35MoESession`.
+On CUDA, the engine loads the model weights once and can create multiple
+isolated sessions by rebinding the model's mutable buffers before execution.
+
 ## Run
 
 The runner requires:
@@ -133,8 +137,28 @@ cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
 | `--data_path` | (none) | Path to `.ptd` delegate data file (required for CUDA) |
 | `--tokenizer_path` | (required) | Path to HuggingFace `tokenizer.json` |
 | `--prompt` | `"Hello"` | Input prompt text |
+| `--prompt_file` | (none) | Path to a prompt file (overrides `--prompt`) |
 | `--temperature` | `0.8` | Sampling temperature (0 = greedy) |
 | `--max_new_tokens` | `128` | Maximum tokens to generate |
+| `--warmup` | `0` | Warmup iterations to discard before timing |
+| `--num_iters` | `1` | Timed iterations to average after warmup |
+| `--cuda_graph` | `false` | CUDA-only decode graph capture for single-session runner use |
+
+`--cuda_graph` is intentionally single-session only. CUDA graph capture records
+device pointers, so replay is not combined with per-session mutable-state rebinding.
+
+### CUDA no-bleed test
+
+The CUDA build also produces `test_qwen35_moe_nobleed`, which validates that two
+sessions can interleave prefill/decode on one loaded model without sharing
+mutable state:
+
+```bash
+QWEN_MODEL_PATH=qwen35_moe_exports/model.pte \
+QWEN_DATA_PATH=qwen35_moe_exports/aoti_cuda_blob.ptd \
+QWEN_TOKENIZER_PATH=~/models/Qwen3.5-35B-A3B/tokenizer.json \
+  cmake-out/examples/models/qwen3_5_moe/test_qwen35_moe_nobleed
+```
 
 ## Troubleshooting
 

diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
@@ -624,9 +624,8 @@ def _materialize_buffers(model, config):
     Replaces meta buffers with real tensors on CPU, recomputes RoPE
     inv_freq and causal masks. State buffers (KV cache, conv/recurrent
     state) are zero-initialized registered buffers. On the CUDA/AOTI backend
-    they are lifted into the delegate as constants and shared across methods at
-    runtime via the backend's per-FQN buffer cache; backends that keep them at
-    the graph level instead share them via share_mutable_buffers.
+    they are lifted into the delegate as named constants; per-session sharing
+    and isolation are handled by runtime rebinding.
     """
     # Masks stay bool, inv_freq stays float32.
     for fqn, buf in list(model.named_buffers()):
@@ -915,6 +914,47 @@ def _export_metal(model, config, args):
     print("Done!")
 
 
+def _qwen_mutable_buffer_fqns(model):
+    """Return the sorted FQNs of every buffer recorded as per-session mutable state."""
+    from executorch.examples.models.qwen3_5_moe.model import GatedDeltaNet, KVCache
+
+    def _state_attrs(mod):
+        # The class-name test runs first, so it wins over the KVCache isinstance test.
+        if mod.__class__.__name__ == "TurboQuantKVCache":
+            return ("k_packed", "k_norms", "v_packed", "v_norms")
+        if isinstance(mod, KVCache):
+            return ("k_cache", "v_cache")
+        if isinstance(mod, GatedDeltaNet):
+            return ("conv_state", "recurrent_state")
+        return ()
+
+    collected = []
+    for path, mod in model.named_modules():
+        collected.extend(f"{path}.{attr}" for attr in _state_attrs(mod))
+
+    registered = dict(model.named_buffers())
+    absent = [name for name in collected if name not in registered]
+    if absent:
+        msg = f"Qwen mutable-buffer contract references missing buffers: {absent}"
+        raise RuntimeError(msg)
+    if not collected:
+        raise RuntimeError("Qwen mutable-buffer contract is empty")
+    return sorted(collected)
+
+
+def _mutable_buffer_metadata_json(model):
+    """Dump the mutable-buffer FQNs as versioned JSON and print their total size."""
+    import json
+
+    names = _qwen_mutable_buffer_fqns(model)
+    by_name = dict(model.named_buffers())
+    # Footprint = sum of numel * element_size over the recorded buffers.
+    nbytes = sum(by_name[n].numel() * by_name[n].element_size() for n in names)
+    size = f"{nbytes} B / {nbytes / 1024:.1f} KiB"
+    print(f"  Recorded {len(names)} mutable buffers ({size} per session)")
+    return json.dumps({"version": 1, "mutable_buffers": names})
+
+
 def _export_cuda(model, config, args):
     """Export model to .pte via torch.export + CUDA backend.
 
@@ -923,13 +963,9 @@ def _export_cuda(model, config, args):
       - "prefill": prefill path (T>=2), batched tensor-core MoE kernel
         via fused_moe_batched_gemm, with dynamic sequence length.
 
-    Both methods share mutable state buffers (KV cache, conv_state,
-    recurrent_state): the model uses registered buffers with in-place
-    updates (no state in/out args). On the CUDA/AOTI backend these buffers
-    are lifted into the delegate as constants and shared across the
-    decode/prefill methods at runtime via the backend's per-FQN buffer cache
-    (share_mutable_buffers is left off for CUDA); backends that keep them at
-    the graph level instead share them via share_mutable_buffers.
+    The model uses registered buffers with in-place updates for KV,
+    conv_state, and recurrent_state. The export records which named buffers
+    are per-session mutable state.
     """
     import torch._inductor.config as inductor_config
 
@@ -1006,6 +1042,7 @@ def _export_cuda(model, config, args):
         "use_kv_cache": True,
         "use_sdpa_with_kv_cache": False,
         "enable_dynamic_shape": True,
+        "get_mutable_buffer_metadata": _mutable_buffer_metadata_json(model),
     }
     et_prog = to_edge_transform_and_lower(
         {"decode": decode_ep, "prefill": prefill_ep},
@@ -1037,7 +1074,9 @@ def _export_cuda(model, config, args):
         config=ExecutorchBackendConfig(
             extract_delegate_segments=True,
             do_quant_fusion_and_const_prop=True,
-            memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
+            memory_planning_pass=MemoryPlanningPass(
+                alloc_graph_input=False,
+            ),
             emit_mutable_buffer_names=True,
         ),
     )