
Commit e8c8e7a

PytorchEngine multi-node support v2 (#3147)
* better dist context
* can not exit
* multinode support
* better exception
* refactor
* fix local rank
* replace group
* fix dist
* remove useless code
* remove finish flag
* refactor engine and model agent
* uni executor
* wip
* tp
* fix
* less async
* circle buf
* event per block
* fast mp
* fix error handler
* remove safe wait
* context in model agent
* fix on stop
* check before init
* fix tp close
* ray ver0
* fix close
* fix remote code
* optimize ray
* better checker and logger
* pack tensor
* auto check dist
* fix mp gloo
* add timer tools
* better scheduler
* fix mp hang
* fix mp
* fix chat
* less output
* merge main
* optimize ray get output
* remove nsight runtime env
* dag
* optimize mp & lint
* optimize mp
* add base workerwrapper
* fix gather, update flags
* better return mask
* add choice
* enable mp,ray with worldsize=1
* fix mp exit
* fix mp vlm
* chat exit
* add docs
* lint
* doc
* dp check
* fix blocked fp8 moe
* remove mask
* fix chat stopwords
* refactor chat
1 parent 83eed6e commit e8c8e7a


48 files changed: +2789 −1132 lines changed

benchmark/profile_throughput.py

Lines changed: 6 additions & 0 deletions
@@ -190,6 +190,11 @@ def parse_args():
     parser.add_argument('--use-uvloop', action='store_true')
     parser.add_argument('--csv', type=str, help='Where to save the result.', default='./profile_throughput.csv')
     parser.add_argument('--seed', type=int, default=0, help='Seed used in sampling prompts from dataset')
+    parser.add_argument('--distributed-executor-backend',
+                        type=str,
+                        default=None,
+                        choices=['uni', 'mp', 'ray'],
+                        help='backend of the distributed executor')
     # other args
     ArgumentHelper.top_p(parser)
     ArgumentHelper.temperature(parser)
@@ -256,6 +261,7 @@ def main():
         enable_prefix_caching=args.enable_prefix_caching,
         quant_policy=args.quant_policy,
         dtype=args.dtype,
+        distributed_executor_backend=args.distributed_executor_backend,
     )
 
     if args.use_uvloop:

docs/en/advance/pytorch_multinodes.md

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
# PyTorchEngine Multi-Node Deployment Guide

To support larger-scale model deployment requirements, PyTorchEngine provides multi-node deployment support. Below are the detailed steps for deploying a `tp=16` model across two 8-GPU nodes.

## 1. Create Docker Containers (Optional)

To ensure consistency across the cluster environment, it is recommended to use Docker to set up the cluster. Create a container on each node as follows:

```bash
docker run -it \
    --network host \
    -v $MODEL_PATH:$CONTAINER_MODEL_PATH \
    openmmlab/lmdeploy:latest
```

> \[!IMPORTANT\]
> Ensure that the model is placed in the same directory on all node containers.

## 2. Set Up the Cluster Using Ray

### 2.1 Start the Head Node

Select one node as the **head node** and run the following command in its container:

```bash
ray start --head --port=$DRIVER_PORT
```

### 2.2 Join the Cluster

On the other nodes, run the following command in their containers to join the cluster created by the head node:

```bash
ray start --address=$DRIVER_NODE_ADDR:$DRIVER_PORT
```

Run `ray status` on the head node to verify that all nodes have successfully joined the cluster.

> \[!IMPORTANT\]
> Ensure that `DRIVER_NODE_ADDR` is the address of the head node and `DRIVER_PORT` matches the port number used during the head node initialization.
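
You can also verify the cluster from Python before starting the engine. The following is a minimal sketch (assuming Ray is importable inside the container, which it should be since `ray start` works there) that attaches to the running cluster and prints the aggregated resources; for two 8-GPU nodes you would expect a GPU count of 16:

```python
# Minimal sketch: attach to the already-running Ray cluster and inspect it.
import ray

ray.init(address='auto')          # connect to the cluster started with `ray start`
print(ray.cluster_resources())    # e.g. {'GPU': 16.0, 'CPU': ..., ...} for two 8-GPU nodes
```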

## 3. Use LMDeploy Interfaces

In the head node's container, you can use all functionalities of PyTorchEngine as usual.

### 3.1 Start the Server

```bash
lmdeploy serve api_server \
    $CONTAINER_MODEL_PATH \
    --backend pytorch \
    --tp 16
```
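
Once the server is up, any OpenAI-compatible client can query it. Below is a minimal sketch using the `openai` Python package; the head-node address is a placeholder and the port assumes the api_server default (23333), so adjust both if you pass a different server port:

```python
# Minimal sketch: query the api_server through its OpenAI-compatible endpoint.
from openai import OpenAI

# <DRIVER_NODE_ADDR> is the head node address; 23333 is the assumed default port.
client = OpenAI(base_url='http://<DRIVER_NODE_ADDR>:23333/v1', api_key='dummy')
model_name = client.models.list().data[0].id   # name the server registered for the model
response = client.chat.completions.create(
    model=model_name,
    messages=[{'role': 'user', 'content': 'Hakuna Matata'}],
)
print(response.choices[0].message.content)
```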

### 3.2 Use the Pipeline

```python
from lmdeploy import pipeline, PytorchEngineConfig

if __name__ == '__main__':
    model_path = '/path/to/model'
    backend_config = PytorchEngineConfig(tp=16)
    with pipeline(model_path, backend_config=backend_config) as pipe:
        outputs = pipe('Hakuna Matata')
```

> \[!NOTE\]
> PyTorchEngine will automatically choose the appropriate launch method (single-node/multi-node) based on the `tp` parameter and the number of devices available in the cluster. If you want to enforce the use of the Ray cluster, you can configure `distributed_executor_backend='ray'` in `PytorchEngineConfig` or use the environment variable `LMDEPLOY_EXECUTOR_BACKEND=ray`.
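
For reference, a minimal sketch of both ways to force the Ray executor (they are alternatives, not both required; the model path is a placeholder, and the environment variable presumably needs to be set before the engine is created):

```python
import os

from lmdeploy import pipeline, PytorchEngineConfig

# Option 1: environment variable (set before building the pipeline).
os.environ['LMDEPLOY_EXECUTOR_BACKEND'] = 'ray'

# Option 2: explicit config field.
backend_config = PytorchEngineConfig(tp=16, distributed_executor_backend='ray')

if __name__ == '__main__':
    with pipeline('/path/to/model', backend_config=backend_config) as pipe:
        print(pipe('Hakuna Matata'))
```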
______________________________________________________________________

By following the steps above, you can successfully deploy PyTorchEngine in a multi-node environment and leverage the Ray cluster for distributed computing.

> \[!WARNING\]
> To achieve better performance, we recommend that users configure a higher-quality network environment (such as [InfiniBand](https://en.wikipedia.org/wiki/InfiniBand)) to improve engine efficiency.

docs/en/index.rst

Lines changed: 1 addition & 0 deletions
@@ -101,6 +101,7 @@ Documentation
    advance/chat_template.md
    advance/debug_turbomind.md
    advance/structed_output.md
+   advance/pytorch_multinodes.md
 
 .. toctree::
    :maxdepth: 1
docs/zh_cn/advance/pytorch_multinodes.md

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
# PyTorchEngine Multi-Node Deployment Guide

To support larger-scale model deployment requirements, PyTorchEngine provides multi-node deployment support. Below are the detailed steps for deploying a tp=16 model across two 8-GPU nodes.

## 1. Create Docker Containers (Optional)

To ensure consistency across the cluster environment, it is recommended to use Docker to set up the cluster. Create a container on each node:

```bash
docker run -it \
    --network host \
    -v $MODEL_PATH:$CONTAINER_MODEL_PATH \
    openmmlab/lmdeploy:latest
```

> \[!IMPORTANT\]
> Make sure the model is placed in the same directory in every node's container.

## 2. Set Up the Cluster Using Ray

### 2.1 Start the Head Node

Select one of the nodes as the **head node** and run the following command in its container:

```bash
ray start --head --port=$DRIVER_PORT
```

### 2.2 Join the Cluster

In the containers of the other nodes, use the following command to join the cluster created by the head node:

```bash
ray start --address=$DRIVER_NODE_ADDR:$DRIVER_PORT
```

Afterwards, run `ray status` on the head node to check the cluster state and make sure all nodes have successfully joined the cluster.

> \[!IMPORTANT\]
> Make sure `DRIVER_NODE_ADDR` is the address of the head node and that `DRIVER_PORT` matches the port used when initializing the head node.

## 3. Use LMDeploy Interfaces

In the head node's container, you can use all functionalities of PyTorchEngine as usual.

### 3.1 Start the API Server

```bash
lmdeploy serve api_server \
    $CONTAINER_MODEL_PATH \
    --backend pytorch \
    --tp 16
```

### 3.2 Use the pipeline Interface

```python
from lmdeploy import pipeline, PytorchEngineConfig

if __name__ == '__main__':
    model_path = '/path/to/model'
    backend_config = PytorchEngineConfig(tp=16)
    with pipeline(model_path, backend_config=backend_config) as pipe:
        outputs = pipe('Hakuna Matata')
```

> \[!NOTE\]
> PytorchEngine automatically chooses the appropriate launch mode (single-node/multi-node) based on the tp value and the number of devices in the cluster. To force the use of the Ray cluster, set `distributed_executor_backend='ray'` in `PytorchEngineConfig` or use the environment variable `LMDEPLOY_EXECUTOR_BACKEND=ray`.

By following the steps above, you can successfully deploy PyTorchEngine in a multi-node environment and leverage the Ray cluster for distributed computing.

> \[!WARNING\]
> To achieve better performance, we recommend that users configure a higher-quality network environment (for example, [InfiniBand](https://en.wikipedia.org/wiki/InfiniBand)) to improve engine efficiency.

docs/zh_cn/advance/pytorch_multithread.md

Lines changed: 1 addition & 24 deletions
@@ -1,29 +1,6 @@
 # PyTorchEngine Multi-Threaded Inference
 
-Since [PR2907](https://github.com/InternLM/lmdeploy/pull/2907), we have removed the thread_safe mode of PytorchEngine so that the engine can run more efficiently. We encourage users to use the **service interface** or **coroutines** wherever possible to achieve high concurrency, for example:
-
-```python
-import asyncio
-from lmdeploy import pipeline, PytorchEngineConfig
-
-event_loop = asyncio.new_event_loop()
-asyncio.set_event_loop(event_loop)
-
-model_path = 'Llama-3.2-1B-Instruct'
-pipe = pipeline(model_path, backend_config=PytorchEngineConfig())
-
-async def _gather_output():
-    tasks = [
-        pipe.async_batch_infer('Hakuna Matata'),
-        pipe.async_batch_infer('giraffes are heartless creatures'),
-    ]
-    return await asyncio.gather(*tasks)
-
-output = asyncio.run(_gather_output())
-print(output[0].text)
-print(output[1].text)
-```
-
+Since [PR2907](https://github.com/InternLM/lmdeploy/pull/2907), we have removed the thread_safe mode of PytorchEngine so that the engine can run more efficiently. We encourage users to use the **service interface** or **coroutines** wherever possible to achieve high concurrency;
 if you really do need multi-threaded inference, you can write a simple wrapper to achieve a similar effect.
 
 ```python

docs/zh_cn/index.rst

Lines changed: 1 addition & 0 deletions
@@ -102,6 +102,7 @@ LMDeploy toolbox provides the following core features:
    advance/chat_template.md
    advance/debug_turbomind.md
    advance/structed_output.md
+   advance/pytorch_multinodes.md
 
 .. toctree::
    :maxdepth: 1

lmdeploy/messages.py

Lines changed: 3 additions & 0 deletions
@@ -278,6 +278,8 @@ class PytorchEngineConfig:
             If unspecified, will use the default version.
         quant_policy (int): default to 0. When k/v is quantized into 4 or 8
             bit, set it to 4 or 8, respectively
+        distributed_executor_backend (str): backend of the distributed
+            executor, options: ['uni', 'mp', 'ray']
     """
     dtype: str = 'auto'
     tp: int = 1
@@ -298,6 +300,7 @@ class PytorchEngineConfig:
     download_dir: str = None
     revision: str = None
     quant_policy: Literal[0, 4, 8] = 0
+    distributed_executor_backend: str = None
 
     def __post_init__(self):
         """Check input validation."""

lmdeploy/pytorch/backends/base.py

Lines changed: 10 additions & 0 deletions
@@ -90,3 +90,13 @@ def build_graph_runner(model: torch.nn.Module, model_config: ModelConfig, cache_
         """build graph runner."""
         from .graph_runner import GraphRunner
         return GraphRunner(model, model_config, cache_config, backend_config, device)
+
+    @staticmethod
+    def device_count():
+        """get num available devices."""
+        return None
+
+    @staticmethod
+    def support_ray():
+        """support ray."""
+        return False
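
The base class returns conservative defaults, so a device-specific backend is expected to override both hooks. A hypothetical CUDA-flavoured override (illustrative only; the class name and whether the real CUDA backend implements it exactly this way are assumptions) might look like:

```python
import torch


class CudaDeviceHooks:
    """Illustrative override of the new base-class hooks for CUDA devices."""

    @staticmethod
    def device_count():
        """Number of CUDA devices visible on this node."""
        return torch.cuda.device_count()

    @staticmethod
    def support_ray():
        """CUDA workers can be scheduled through a Ray cluster."""
        return True
```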

lmdeploy/pytorch/backends/cuda/awq_modules.py

Lines changed: 2 additions & 1 deletion
@@ -2,7 +2,8 @@
 from typing import Optional
 
 import torch
-from torch import distributed as dist
+
+import lmdeploy.pytorch.distributed as dist
 
 from ..awq_modules import LinearW4A16Builder, LinearW4A16Impl
 
lmdeploy/pytorch/backends/cuda/blockedf8_modules.py

Lines changed: 1 addition & 2 deletions
@@ -1,10 +1,9 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-
 from typing import Optional
 
 import torch
-import torch.distributed as dist
 
+import lmdeploy.pytorch.distributed as dist
 from lmdeploy.pytorch.kernels.cuda.blocked_gemm_fp8 import blocked_gemm_fp8, quant_fp8
 
 from ..blockedf8_modules import LinearBlockedF8Builder, LinearBlockedF8Impl
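
Both of the import changes above swap `torch.distributed` for `lmdeploy.pytorch.distributed` under the same `dist` alias. A reasonable reading (not verified against the rest of this commit) is that the new module exposes a `torch.distributed`-compatible surface bound to the engine's own process groups, so existing call sites can keep working unchanged, along the lines of:

```python
# Sketch of the unchanged call-site pattern; assumes lmdeploy.pytorch.distributed
# mirrors the torch.distributed functions used here.
import torch

import lmdeploy.pytorch.distributed as dist  # previously: from torch import distributed as dist


def reduce_partial_output(out: torch.Tensor) -> torch.Tensor:
    """Sum tensor-parallel partial results across ranks, as TP linear layers typically do."""
    if dist.is_initialized():   # hypothetical mirror of torch.distributed.is_initialized
        dist.all_reduce(out)    # hypothetical mirror of torch.distributed.all_reduce
    return out
```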
