Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions .ci/scripts/export_model_artifact.sh
Original file line number Diff line number Diff line change
Expand Up @@ -406,8 +406,9 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then

# Download prequantized model outside OUTPUT_DIR to avoid uploading on failure
LOCAL_MODEL_DIR=$(mktemp -d)
INDUCTOR_CACHE=$(mktemp -d)
trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE"' EXIT
INDUCTOR_CACHE=$(mktemp -d "${RUNNER_TEMP:-/tmp}/inductor_cache_XXXXXX")
INDUCTOR_TMPDIR=$(mktemp -d "${RUNNER_TEMP:-/tmp}/tmpdir_XXXXXX")
trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE" "$INDUCTOR_TMPDIR"' EXIT

python -c "from huggingface_hub import snapshot_download; snapshot_download('${HF_MODEL}', local_dir='${LOCAL_MODEL_DIR}')"

Expand All @@ -427,6 +428,7 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
# Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
echo "::group::Export"
EXPORT_LOG=$(mktemp)
TMPDIR="$INDUCTOR_TMPDIR" \
TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
python -m executorch.examples.models.qwen3_5_moe.export \
--prequantized "$LOCAL_MODEL_DIR" \
Expand Down Expand Up @@ -473,8 +475,9 @@ if [ "$MODEL_NAME" = "gemma4_31b" ]; then

# Download prequantized model outside OUTPUT_DIR to avoid uploading on failure
LOCAL_MODEL_DIR=$(mktemp -d)
INDUCTOR_CACHE=$(mktemp -d)
trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE"' EXIT
INDUCTOR_CACHE=$(mktemp -d "${RUNNER_TEMP:-/tmp}/inductor_cache_XXXXXX")
INDUCTOR_TMPDIR=$(mktemp -d "${RUNNER_TEMP:-/tmp}/tmpdir_XXXXXX")
trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE" "$INDUCTOR_TMPDIR"' EXIT

python -c "from huggingface_hub import snapshot_download; snapshot_download('${HF_MODEL}', local_dir='${LOCAL_MODEL_DIR}')"

Expand All @@ -498,6 +501,7 @@ if [ "$MODEL_NAME" = "gemma4_31b" ]; then

# Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
echo "::group::Export"
TMPDIR="$INDUCTOR_TMPDIR" \
TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
python -m executorch.examples.models.gemma4_31b.export \
--prequantized "$LOCAL_MODEL_DIR" \
Expand Down
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -433,11 +433,12 @@ voxtral_tts-cuda:
qwen3_5_moe-cuda:
@echo "==> Building and installing ExecuTorch with CUDA..."
cmake --workflow --preset llm-release-cuda
@echo "==> Building Qwen3.5 MoE runner with CUDA..."
@echo "==> Building Qwen3.5 MoE runner and no-bleed test with CUDA..."
cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-cuda
@echo ""
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
@echo " Test: cmake-out/examples/models/qwen3_5_moe/test_qwen35_moe_nobleed"

gemma4_31b-cuda:
@echo "==> Building and installing ExecuTorch with CUDA..."
Expand Down
20 changes: 18 additions & 2 deletions examples/models/qwen3_5_moe/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

set(_common_include_directories ${EXECUTORCH_ROOT}/..)
set(_json_include
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include
)

# gflags
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
Expand Down Expand Up @@ -60,13 +63,26 @@ endif()
# Tokenizer
list(APPEND link_libraries tokenizers::tokenizers)

add_executable(qwen3_5_moe_runner main.cpp)
add_executable(qwen3_5_moe_runner main.cpp qwen35_moe_engine.cpp)
target_include_directories(
qwen3_5_moe_runner PUBLIC ${_common_include_directories}
qwen3_5_moe_runner PUBLIC ${_common_include_directories} ${_json_include}
)
target_link_libraries(qwen3_5_moe_runner PUBLIC ${link_libraries})

if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
target_link_options_gc_sections(qwen3_5_moe_runner)
target_link_options(qwen3_5_moe_runner PRIVATE "LINKER:-s")
endif()

# CUDA-only: build the `test_qwen35_moe_nobleed` executable and register it
# with CTest under the name `qwen_nobleed`. Nothing in this block is defined
# when EXECUTORCH_BUILD_CUDA is off.
if(EXECUTORCH_BUILD_CUDA)
enable_testing()
# The test binary is built from its own driver plus qwen35_moe_engine.cpp,
# which is listed directly as a source of this target.
add_executable(
test_qwen35_moe_nobleed test_qwen35_moe_nobleed.cpp qwen35_moe_engine.cpp
)
# Uses the common include root and the bundled JSON single-header directory.
target_include_directories(
test_qwen35_moe_nobleed PUBLIC ${_common_include_directories}
${_json_include}
)
# Links the `link_libraries` list assembled earlier in this file.
target_link_libraries(test_qwen35_moe_nobleed PUBLIC ${link_libraries})
# NOTE(review): no ENVIRONMENT property is set on this test. The README runs the
# binary with QWEN_MODEL_PATH / QWEN_DATA_PATH / QWEN_TOKENIZER_PATH exported,
# so `ctest` presumably depends on the caller's environment -- confirm.
add_test(NAME qwen_nobleed COMMAND test_qwen35_moe_nobleed)
endif()
4 changes: 2 additions & 2 deletions examples/models/qwen3_5_moe/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@
"buildPresets": [
{
"name": "qwen3-5-moe-cuda",
"displayName": "Build Qwen3.5 MoE runner (CUDA)",
"displayName": "Build Qwen3.5 MoE runner + no-bleed test (CUDA)",
"configurePreset": "qwen3-5-moe-cuda",
"targets": ["qwen3_5_moe_runner"]
"targets": ["qwen3_5_moe_runner", "test_qwen35_moe_nobleed"]
},
{
"name": "qwen3-5-moe-metal",
Expand Down
26 changes: 25 additions & 1 deletion examples/models/qwen3_5_moe/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ It can be uploaded to HuggingFace Hub for easy sharing.

ExecuTorch must be installed from source first (see
[Prerequisites](#prerequisites)). The `make` target handles building
core libraries and the runner binary.
core libraries, the runner binary, and the CUDA no-bleed test binary.

```bash
make qwen3_5_moe-cuda
Expand All @@ -109,6 +109,10 @@ make qwen3_5_moe-cuda
This builds ExecuTorch with CUDA backend support, then the runner binary
at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner`.

The runner is a thin CLI over `Qwen35MoEEngine` and `Qwen35MoESession`.
On CUDA, the engine loads the model weights once and can create multiple
isolated sessions by rebinding the model's mutable buffers before execution.
Comment thread
mergennachin marked this conversation as resolved.

## Run

The runner requires:
Expand All @@ -133,8 +137,28 @@ cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
| `--data_path` | (none) | Path to `.ptd` delegate data file (required for CUDA) |
| `--tokenizer_path` | (required) | Path to HuggingFace `tokenizer.json` |
| `--prompt` | `"Hello"` | Input prompt text |
| `--prompt_file` | (none) | Path to a prompt file (overrides `--prompt`) |
| `--temperature` | `0.8` | Sampling temperature (0 = greedy) |
| `--max_new_tokens` | `128` | Maximum tokens to generate |
| `--warmup` | `0` | Warmup iterations to discard before timing |
| `--num_iters` | `1` | Timed iterations to average after warmup |
| `--cuda_graph` | `false` | CUDA-only decode graph capture for single-session runner use |

`--cuda_graph` is intentionally single-session only. CUDA graph replay captures

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should find some way to support CUDA graphs in a multi-session setting. One idea is to promote the CUDA graph configs into the LLM sessions, and whenever we switch to a new session we recapture the graph.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will be handled as a follow-up. Created issue #20310.

device pointers, so it is not combined with per-session mutable-state rebinding.

### CUDA no-bleed test

The CUDA build also produces `test_qwen35_moe_nobleed`, which validates that two
sessions can interleave prefill/decode on one loaded model without sharing
mutable state:

```bash
QWEN_MODEL_PATH=qwen35_moe_exports/model.pte \
QWEN_DATA_PATH=qwen35_moe_exports/aoti_cuda_blob.ptd \
QWEN_TOKENIZER_PATH=~/models/Qwen3.5-35B-A3B/tokenizer.json \
cmake-out/examples/models/qwen3_5_moe/test_qwen35_moe_nobleed
```

## Troubleshooting

Expand Down
61 changes: 50 additions & 11 deletions examples/models/qwen3_5_moe/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -624,9 +624,8 @@ def _materialize_buffers(model, config):
Replaces meta buffers with real tensors on CPU, recomputes RoPE
inv_freq and causal masks. State buffers (KV cache, conv/recurrent
state) are zero-initialized registered buffers. On the CUDA/AOTI backend
they are lifted into the delegate as constants and shared across methods at
runtime via the backend's per-FQN buffer cache; backends that keep them at
the graph level instead share them via share_mutable_buffers.
they are lifted into the delegate as named constants; per-session sharing
and isolation are handled by runtime rebinding.
"""
# Masks stay bool, inv_freq stays float32.
for fqn, buf in list(model.named_buffers()):
Expand Down Expand Up @@ -915,6 +914,47 @@ def _export_metal(model, config, args):
print("Done!")


def _qwen_mutable_buffer_fqns(model):
from executorch.examples.models.qwen3_5_moe.model import GatedDeltaNet, KVCache

fqns = []
for prefix, module in model.named_modules():
if module.__class__.__name__ == "TurboQuantKVCache":
fqns += [
f"{prefix}.k_packed",
f"{prefix}.k_norms",
f"{prefix}.v_packed",
f"{prefix}.v_norms",
]
elif isinstance(module, KVCache):
fqns += [f"{prefix}.k_cache", f"{prefix}.v_cache"]
elif isinstance(module, GatedDeltaNet):
fqns += [f"{prefix}.conv_state", f"{prefix}.recurrent_state"]

named = dict(model.named_buffers())
missing = [f for f in fqns if f not in named]
if missing:
raise RuntimeError(
f"Qwen mutable-buffer contract references missing buffers: {missing}"
)
if not fqns:
raise RuntimeError("Qwen mutable-buffer contract is empty")
return sorted(fqns)


def _mutable_buffer_metadata_json(model):
    """Serialize the model's mutable-buffer contract as a JSON string.

    Looks up the mutable buffer names via ``_qwen_mutable_buffer_fqns``, prints
    how many were recorded together with their combined size in bytes and KiB,
    and returns the metadata payload.

    Args:
        model: Object exposing ``named_modules()`` and ``named_buffers()``;
            each listed buffer must provide ``numel()`` and ``element_size()``.

    Returns:
        A JSON string of the form
        ``{"version": 1, "mutable_buffers": [<sorted buffer names>]}``.

    Raises:
        RuntimeError: Propagated from ``_qwen_mutable_buffer_fqns`` when the
            contract is empty or references missing buffers.
    """
    import json

    buffer_names = _qwen_mutable_buffer_fqns(model)
    buffers_by_name = dict(model.named_buffers())

    # Sum element count times bytes per element over every mutable buffer.
    session_bytes = 0
    for name in buffer_names:
        tensor = buffers_by_name[name]
        session_bytes += tensor.numel() * tensor.element_size()

    print(
        f" Recorded {len(buffer_names)} mutable buffers "
        f"({session_bytes} B / {session_bytes / 1024:.1f} KiB per session)"
    )
    payload = {"version": 1, "mutable_buffers": buffer_names}
    return json.dumps(payload)


def _export_cuda(model, config, args):
"""Export model to .pte via torch.export + CUDA backend.

Expand All @@ -923,13 +963,9 @@ def _export_cuda(model, config, args):
- "prefill": prefill path (T>=2), batched tensor-core MoE kernel
via fused_moe_batched_gemm, with dynamic sequence length.

Both methods share mutable state buffers (KV cache, conv_state,
recurrent_state): the model uses registered buffers with in-place
updates (no state in/out args). On the CUDA/AOTI backend these buffers
are lifted into the delegate as constants and shared across the
decode/prefill methods at runtime via the backend's per-FQN buffer cache
(share_mutable_buffers is left off for CUDA); backends that keep them at
the graph level instead share them via share_mutable_buffers.
The model uses registered buffers with in-place updates for KV,
conv_state, and recurrent_state. The export records which named buffers
are per-session mutable state.
"""
import torch._inductor.config as inductor_config

Expand Down Expand Up @@ -1006,6 +1042,7 @@ def _export_cuda(model, config, args):
"use_kv_cache": True,
"use_sdpa_with_kv_cache": False,
"enable_dynamic_shape": True,
"get_mutable_buffer_metadata": _mutable_buffer_metadata_json(model),
}
et_prog = to_edge_transform_and_lower(
{"decode": decode_ep, "prefill": prefill_ep},
Expand Down Expand Up @@ -1037,7 +1074,9 @@ def _export_cuda(model, config, args):
config=ExecutorchBackendConfig(
extract_delegate_segments=True,
do_quant_fusion_and_const_prop=True,
memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
memory_planning_pass=MemoryPlanningPass(
alloc_graph_input=False,
),
emit_mutable_buffer_names=True,
),
)
Expand Down
Loading
Loading