From c1bdd95cc9edf7620e72678d05c0ee624d282f04 Mon Sep 17 00:00:00 2001 From: Ruodi Lu Date: Wed, 27 May 2026 03:36:20 +0000 Subject: [PATCH] [https://nvbugs/6193836][test] Use EP=8 + attention DP for minimax_m2.5 8-GPU perf MiniMax-M2.5 FP8 has `intermediate_size=1536` and `weight_block_size=128`. TRT-LLM-gen / CUTLASS / DeepGEMM FP8 MoE kernels require the per-rank intermediate size to be a multiple of the block size 128. Under TP=8 each rank gets 1536/8=192, which fails the assert. Per developer guidance, route MoE through EP=8 and rely on attention DP instead of TP. Changes: - llm_perf_core.yml: switch the 7 minimax_m2.5_fp8 8-GPU test names from `tp:8-gpus:8` to `ep:8-gpus:8`. - pytorch_model_config.py: add a pattern matching exactly those 7 cases and enable `enable_attention_dp: True` in the generated trtllm-bench config. The 4-GPU tests (TP=4 -> 1536/4=384) are unaffected and not touched. Fixes: NVBugs 6193836. Signed-off-by: Ruodi Lu --- .../defs/perf/pytorch_model_config.py | 17 +++++++++++++++++ .../integration/test_lists/qa/llm_perf_core.yml | 16 ++++++++-------- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index 9ca366e03b3c..156d69e7535e 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -251,6 +251,23 @@ def get_model_yaml_config(model_label: str, 'enable_attention_dp': True, } }, + # MiniMax-M2.5 FP8 on 8 GPUs: intermediate_size=1536 with weight_block_size=128 + # is not divisible under TP=8 (1536/8=192), so route MoE through EP=8 and use + # attention DP instead of TP. 
+ { + 'patterns': [ + 'minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-gpus:8', + 'minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:500,2000-ep:8-gpus:8', + 'minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:2000,500-ep:8-gpus:8', + 'minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,1000-ep:8-gpus:8', + 'minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,2000-ep:8-gpus:8', + 'minimax_m2.5_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-gpus:8', + 'minimax_m2.5_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:512-ep:8-gpus:8', + ], + 'config': { + 'enable_attention_dp': True, + } + }, { 'patterns': [ 'qwen3_4b-bench-pytorch-streaming-bfloat16-maxbs:4-kv_frac:0.6-input_output_len:500,100-reqs:200-con:4', diff --git a/tests/integration/test_lists/qa/llm_perf_core.yml b/tests/integration/test_lists/qa/llm_perf_core.yml index 05d63f11d4e3..ffb34a1578ea 100644 --- a/tests/integration/test_lists/qa/llm_perf_core.yml +++ b/tests/integration/test_lists/qa/llm_perf_core.yml @@ -321,14 +321,14 @@ llm_perf_core: #llama_v3.3_70b_instruct_fp8 #pytorch backend - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:1000-tp:8-gpus:8] TIMEOUT(120) - #minimax_m2.5 (FP8 216G, 8-GPU) - - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:128,128-tp:8-gpus:8] - - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:500,2000-tp:8-gpus:8] - - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:2000,500-tp:8-gpus:8] - - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,1000-tp:8-gpus:8] - - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,2000-tp:8-gpus:8] - - 
perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-tp:8-gpus:8] #min_latency - - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:512-tp:8-gpus:8] #max_throughput + #minimax_m2.5 (FP8 216G, 8-GPU) - use EP=8 + attention DP; TP=8 would split intermediate_size 1536/8=192 (not divisible by 128 block size) + - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-gpus:8] + - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:500,2000-ep:8-gpus:8] + - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:2000,500-ep:8-gpus:8] + - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,1000-ep:8-gpus:8] + - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-input_output_len:1000,2000-ep:8-gpus:8] + - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,1000-reqs:10-con:1-ep:8-gpus:8] #min_latency + - perf/test_perf.py::test_perf[minimax_m2.5_fp8-bench-pytorch-float8-maxbs:512-input_output_len:1000,1000-con:512-ep:8-gpus:8] #max_throughput #llama_v3.1_nemotron_ultra_253b (nemotron-nas BF16 474G, 8-GPU) - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:128,128-tp:8-gpus:8] - perf/test_perf.py::test_perf[llama_v3.1_nemotron_ultra_253b-bench-pytorch-bfloat16-input_output_len:500,2000-tp:8-gpus:8]