huggingface · icyxp · Jul 23, 2025 · Jul 29, 2025
diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py
@@ -152,6 +152,9 @@
     from text_generation_server.models.custom_modeling.flash_qwen2_modeling import (
         Qwen2ForCausalLM,
     )
+    from text_generation_server.models.custom_modeling.flash_qwen3_modeling import (
+        Qwen3ForCausalLM,
+    )
     from text_generation_server.models.custom_modeling.flash_mistral_modeling import (
         FlashMistralForCausalLM,
     )
@@ -348,6 +351,11 @@ class ModelType(enum.Enum):
         "name": "Qwen 2",
         "url": "https://huggingface.co/collections/Qwen/qwen2-6659360b33528ced941e557f",
     }
+    QWEN3 = {
+        "type": "qwen3",
+        "name": "Qwen 3",
+        "url": "https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f",
+    }
     QWEN2_VL = {
         "type": "qwen2_vl",
         "name": "Qwen 2 VL",
@@ -1470,6 +1478,40 @@ def get_model(
                 trust_remote_code=trust_remote_code,
             )
 
+    if model_type == QWEN3:  # dispatch for Qwen 3 checkpoints
+        if FLASH_ATTENTION:  # first choice: the custom Qwen3ForCausalLM modeling
+            return FlashCausalLM(
+                model_id=model_id,
+                model_class=Qwen3ForCausalLM,
+                revision=revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                kv_cache_dtype=kv_cache_dtype,
+                trust_remote_code=trust_remote_code,
+                lora_adapter_ids=lora_adapter_ids,
+            )
+        elif FLASH_TRANSFORMERS_BACKEND:  # second choice: transformers-based flash fallback
+            return TransformersFlashCausalLM.fallback(  # NOTE(review): kv_cache_dtype / lora_adapter_ids not forwarded - confirm intended
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+        elif sharded:  # neither flash path is enabled here, so sharding is rejected
+            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Qwen3"))
+        else:  # unsharded and no flash backend: generic CausalLM fallback
+            return CausalLM.fallback(  # NOTE(review): kv_cache_dtype not forwarded here either - confirm
+                model_id,
+                revision,
+                quantize=quantize,
+                speculator=speculator,
+                dtype=dtype,
+                trust_remote_code=trust_remote_code,
+            )
+
     if model_type == OPT:
         return CausalLM(
             model_id=model_id,