Skip to content

Commit 8fe76b9

Browse files
committed
Specify Ascend NPU for inference.
1 parent 1cd4b74 commit 8fe76b9

File tree

3 files changed

+27
-0
lines changed

3 files changed

+27
-0
lines changed

fastchat/serve/cli.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
- Type "!!save <filename>" to save the conversation history to a json file.
1414
- Type "!!load <filename>" to load a conversation history from a json file.
1515
"""
16+
1617
import argparse
1718
import os
1819
import re
@@ -197,6 +198,14 @@ def main(args):
197198
)
198199
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
199200
os.environ["XPU_VISIBLE_DEVICES"] = args.gpus
201+
if len(args.gpus.split(",")) == 1:
202+
try:
203+
import torch_npu
204+
205+
torch.npu.set_device(int(args.gpus))
206+
print(f"NPU is available, now model is running on npu:{args.gpus}")
207+
except ModuleNotFoundError:
208+
pass
200209
if args.enable_exllama:
201210
exllama_config = ExllamaConfig(
202211
max_seq_len=args.exllama_max_seq_len,

fastchat/serve/model_worker.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
A model worker that executes the model.
33
"""
4+
45
import argparse
56
import base64
67
import gc
@@ -351,6 +352,14 @@ def create_model_worker():
351352
f"Larger --num-gpus ({args.num_gpus}) than --gpus {args.gpus}!"
352353
)
353354
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
355+
if len(args.gpus.split(",")) == 1:
356+
try:
357+
import torch_npu
358+
359+
torch.npu.set_device(int(args.gpus))
360+
print(f"NPU is available, now model is running on npu:{args.gpus}")
361+
except ModuleNotFoundError:
362+
pass
354363

355364
gptq_config = GptqConfig(
356365
ckpt=args.gptq_ckpt or args.model_path,

fastchat/serve/multi_model_worker.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
We recommend using this with multiple Peft models (with `peft` in the name)
1212
where all Peft models are trained on the exact same base model.
1313
"""
14+
1415
import argparse
1516
import asyncio
1617
import dataclasses
@@ -206,6 +207,14 @@ def create_multi_model_worker():
206207
f"Larger --num-gpus ({args.num_gpus}) than --gpus {args.gpus}!"
207208
)
208209
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
210+
if len(args.gpus.split(",")) == 1:
211+
try:
212+
import torch_npu
213+
214+
torch.npu.set_device(int(args.gpus))
215+
print(f"NPU is available, now model is running on npu:{args.gpus}")
216+
except ModuleNotFoundError:
217+
pass
209218

210219
gptq_config = GptqConfig(
211220
ckpt=args.gptq_ckpt or args.model_path,

0 commit comments

Comments (0)