Skip to content

[Bug]: Qwen3.5 does not work with pipeline parallelism #36643

@VirgilG72

Description

@VirgilG72

Your current environment

The output of python collect_env.py
Your output of `python collect_env.py` here

πŸ› Describe the bug

vllm server bash

python3 -m vllm.entrypoints.openai.api_server --served-model-name Qwen3.5-35B-A3B --model /athena/Qwen3.5-35B-A3B --gpu-memory-utilization 0.9 --tensor-parallel-size 1 --pipeline-parallel-size 4 --moe-backend marlin --max-model-len 160000 --max-num-batched-tokens 4096 --max-num-seqs 32 --distributed-executor-backend ray --enable-log-requests --enable-log-outputs --enable-auto-tool-choice --tool-call-parser qwen3_coder --enable-prefix-caching --reasoning-parser qwen3 --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":2}'

error output

root@xuanwu-text-safety-qwen3-5-1355630-z8p8q:/data# nohup python3 -m vllm.entrypoints.openai.api_server --served-model-name Qwen3.5-35B-A3B --model /athena/Qwen3.5-35B-A3B --gpu-memory-utilization 0.9 --tensor-parallel-size 1 --pipeline-parallel-size 4 --moe-backend marlin --max-model-len 160000 --max-num-batched-tokens 4096 --max-num-seqs 32 --distributed-executor-backend ray --enable-log-requests --enable-log-outputs --enable-auto-tool-choice --tool-call-parser qwen3_coder --enable-prefix-caching --reasoning-parser qwen3 --speculative-config  '{"method":"qwen3_next_mtp","num_speculative_tokens":2}' > vllm.log 2>&1 &
[1] 443977
root@xuanwu-text-safety-qwen3-5-1355630-z8p8q:/data# tail -f vllm.log  
nohup: ignoring input
(APIServer pid=443977) INFO 03-10 19:20:41 [utils.py:302] 
(APIServer pid=443977) INFO 03-10 19:20:41 [utils.py:302]        β–ˆ     β–ˆ     β–ˆβ–„   β–„β–ˆ
(APIServer pid=443977) INFO 03-10 19:20:41 [utils.py:302]  β–„β–„ β–„β–ˆ β–ˆ     β–ˆ     β–ˆ β–€β–„β–€ β–ˆ  version 0.17.0
(APIServer pid=443977) INFO 03-10 19:20:41 [utils.py:302]   β–ˆβ–„β–ˆβ–€ β–ˆ     β–ˆ     β–ˆ     β–ˆ  model   /athena/Qwen3.5-35B-A3B
(APIServer pid=443977) INFO 03-10 19:20:41 [utils.py:302]    β–€β–€  β–€β–€β–€β–€β–€ β–€β–€β–€β–€β–€ β–€     β–€
(APIServer pid=443977) INFO 03-10 19:20:41 [utils.py:302] 
(APIServer pid=443977) INFO 03-10 19:20:41 [utils.py:238] non-default args: {'enable_auto_tool_choice': True, 'tool_call_parser': 'qwen3_coder', 'enable_log_outputs': True, 'model': '/athena/Qwen3.5-35B-A3B', 'max_model_len': 160000, 'served_model_name': ['Qwen3.5-35B-A3B'], 'reasoning_parser': 'qwen3', 'distributed_executor_backend': 'ray', 'pipeline_parallel_size': 4, 'enable_prefix_caching': True, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'moe_backend': 'marlin', 'speculative_config': {'method': 'qwen3_next_mtp', 'num_speculative_tokens': 2}, 'enable_log_requests': True}
(APIServer pid=443977) INFO 03-10 19:20:41 [model.py:531] Resolved architecture: Qwen3_5MoeForConditionalGeneration
(APIServer pid=443977) INFO 03-10 19:20:41 [model.py:1554] Using max model len 160000
(APIServer pid=443977) WARNING 03-10 19:20:41 [speculative.py:346] method `qwen3_next_mtp` is deprecated and replaced with mtp.
(APIServer pid=443977) INFO 03-10 19:20:41 [model.py:531] Resolved architecture: Qwen3_5MoeMTP
(APIServer pid=443977) INFO 03-10 19:20:41 [model.py:1554] Using max model len 262144
(APIServer pid=443977) WARNING 03-10 19:20:41 [speculative.py:487] Enabling num_speculative_tokens > 1 will run multiple times of forward on same MTP layer,which may result in lower acceptance rate
(APIServer pid=443977) Traceback (most recent call last):
(APIServer pid=443977)   File "<frozen runpy>", line 198, in _run_module_as_main
(APIServer pid=443977)   File "<frozen runpy>", line 88, in _run_code
(APIServer pid=443977)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 545, in <module>
(APIServer pid=443977)     uvloop.run(run_server(args))
(APIServer pid=443977)   File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run
(APIServer pid=443977)     return __asyncio.run(
(APIServer pid=443977)            ^^^^^^^^^^^^^^
(APIServer pid=443977)   File "/usr/lib/python3.12/asyncio/runners.py", line 194, in run
(APIServer pid=443977)     return runner.run(main)
(APIServer pid=443977)            ^^^^^^^^^^^^^^^^
(APIServer pid=443977)   File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
(APIServer pid=443977)     return self._loop.run_until_complete(task)
(APIServer pid=443977)            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=443977)   File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=443977)   File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper
(APIServer pid=443977)     return await main
(APIServer pid=443977)            ^^^^^^^^^^
(APIServer pid=443977)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 471, in run_server
(APIServer pid=443977)     await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=443977)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 490, in run_server_worker
(APIServer pid=443977)     async with build_async_engine_client(
(APIServer pid=443977)                ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=443977)   File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=443977)     return await anext(self.gen)
(APIServer pid=443977)            ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=443977)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 96, in build_async_engine_client
(APIServer pid=443977)     async with build_async_engine_client_from_engine_args(
(APIServer pid=443977)                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=443977)   File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=443977)     return await anext(self.gen)
(APIServer pid=443977)            ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=443977)   File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 122, in build_async_engine_client_from_engine_args
(APIServer pid=443977)     vllm_config = engine_args.create_engine_config(usage_context=usage_context)
(APIServer pid=443977)                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=443977)   File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1729, in create_engine_config
(APIServer pid=443977)     speculative_config = self.create_speculative_config(
(APIServer pid=443977)                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=443977)   File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1443, in create_speculative_config
(APIServer pid=443977)     return SpeculativeConfig(**self.speculative_config)
(APIServer pid=443977)            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=443977)   File "/usr/local/lib/python3.12/dist-packages/pydantic/_internal/_dataclasses.py", line 121, in __init__
(APIServer pid=443977)     s.__pydantic_validator__.validate_python(ArgsKwargs(args, kwargs), self_instance=s)
(APIServer pid=443977)   File "/usr/local/lib/python3.12/dist-packages/vllm/config/speculative.py", line 764, in _verify_args
(APIServer pid=443977)     self.draft_model_config.verify_with_parallel_config(
(APIServer pid=443977)   File "/usr/local/lib/python3.12/dist-packages/vllm/config/model.py", line 1065, in verify_with_parallel_config
(APIServer pid=443977)     raise NotImplementedError(
(APIServer pid=443977) NotImplementedError: Pipeline parallelism is not supported for this model. Supported models implement the `SupportsPP` interface.

Before submitting a new issue...

  • Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the documentation page, which can answer lots of frequently asked questions.

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug — Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions