|
24 | 24 |
|
25 | 25 | from tests.ut.base import TestBase |
26 | 26 | from vllm_ascend.ascend_forward_context import MoECommType |
27 | | -from vllm_ascend.ops.common_fused_moe import AscendUnquantizedFusedMoEMethod |
28 | | -from vllm_ascend.ops.moe.experts_selector import select_experts |
29 | | -from vllm_ascend.ops.moe.moe_mlp import cumsum_group_list, unified_apply_mlp |
| 27 | +from vllm_ascend.ops.fused_moe.experts_selector import select_experts |
| 28 | +from vllm_ascend.ops.fused_moe.fused_moe import ( |
| 29 | + AscendFusedMoE, AscendUnquantizedFusedMoEMethod) |
| 30 | +from vllm_ascend.ops.fused_moe.moe_mlp import (cumsum_group_list, |
| 31 | + unified_apply_mlp) |
30 | 32 | from vllm_ascend.utils import AscendSocVersion, adapt_patch |
31 | 33 |
|
32 | 34 | adapt_patch(True) |
@@ -69,10 +71,11 @@ def setup_vllm_config_mock(mocker: MockerFixture): |
69 | 71 | mock_vllm_config.scheduler_config = MagicMock(max_num_seqs=4) |
70 | 72 | mock_vllm_config.model_config.max_model_len = 2048 |
71 | 73 |
|
72 | | - mocker.patch('vllm_ascend.ops.common_fused_moe.get_current_vllm_config', |
73 | | - return_value=mock_vllm_config) |
74 | | - mocker.patch('vllm_ascend.ops.moe.moe_comm_method.get_current_vllm_config', |
| 74 | + mocker.patch('vllm_ascend.ops.fused_moe.fused_moe.get_current_vllm_config', |
75 | 75 | return_value=mock_vllm_config) |
| 76 | + mocker.patch( |
| 77 | + 'vllm_ascend.ops.fused_moe.moe_comm_method.get_current_vllm_config', |
| 78 | + return_value=mock_vllm_config) |
76 | 79 |
|
77 | 80 |
|
78 | 81 | @pytest.fixture |
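
Note on the patch targets in the hunk above: unittest.mock replaces a name in the namespace where it is looked up, not where it is defined, so once these helpers are imported by vllm_ascend.ops.fused_moe.fused_moe, every patch path has to follow that importing module. A minimal, self-contained illustration of the rule using only the standard library (not the repo's code):

from unittest.mock import patch
import os.path

# os.path holds its own reference to exists(), so the patch must name the
# lookup site (os.path.exists), not the module that originally defined it.
with patch('os.path.exists', return_value=True):
    assert os.path.exists('/definitely/not/there')  # sees the mock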
@@ -105,37 +108,37 @@ def mock_finalize(hidden_states, **kwargs): |
105 | 108 |
|
106 | 109 | with patch('torch.distributed.get_rank', return_value=0), \ |
107 | 110 | patch('torch.distributed.get_world_size', return_value=4), \ |
108 | | - patch('vllm_ascend.ops.common_fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \ |
109 | | - patch('vllm_ascend.ops.moe.token_dispatcher.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \ |
110 | | - patch('vllm_ascend.ops.common_fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \ |
111 | | - patch('vllm_ascend.ops.common_fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \ |
| 111 | + patch('vllm_ascend.ops.fused_moe.fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \ |
| 112 | + patch('vllm_ascend.ops.fused_moe.token_dispatcher.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \ |
| 113 | + patch('vllm_ascend.ops.fused_moe.fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \ |
| 114 | + patch('vllm_ascend.ops.fused_moe.fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \ |
112 | 115 | patch('vllm.distributed.parallel_state.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \ |
113 | | - patch('vllm_ascend.ops.common_fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \ |
| 116 | + patch('vllm_ascend.ops.fused_moe.fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \ |
114 | 117 | patch('vllm.model_executor.layers.fused_moe.layer.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \ |
115 | 118 | patch('vllm.model_executor.layers.fused_moe.config.get_dp_group', |
116 | 119 | return_value=mock_dp_and_tp_group(mocker)), \ |
117 | | - patch('vllm_ascend.ops.common_fused_moe.get_ascend_config', |
| 120 | + patch('vllm_ascend.ops.fused_moe.fused_moe.get_ascend_config', |
118 | 121 | return_value=MagicMock( |
119 | 122 | torchair_graph_config=MagicMock(enabled=False), |
120 | 123 | enable_multistream_moe=False, |
121 | 124 | expert_map_path=None |
122 | 125 | )), \ |
123 | | - patch('vllm_ascend.ops.common_fused_moe.determine_expert_map', |
| 126 | + patch('vllm_ascend.ops.fused_moe.fused_moe.determine_expert_map', |
124 | 127 | return_value=(3, torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]))), \ |
125 | | - patch('vllm_ascend.ops.common_fused_moe.get_forward_context', |
| 128 | + patch('vllm_ascend.ops.fused_moe.fused_moe.get_forward_context', |
126 | 129 | return_value=mock_forward_context_obj), \ |
127 | | - patch('vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_forward_context', |
| 130 | + patch('vllm_ascend.ops.fused_moe.prepare_finalize.get_forward_context', |
128 | 131 | return_value=mock_forward_context_obj), \ |
129 | 132 | patch("vllm_ascend.utils.get_ascend_soc_version", return_value=AscendSocVersion.A3), \ |
130 | | - patch('vllm_ascend.ops.moe.moe_mlp.get_forward_context', |
| 133 | + patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context', |
131 | 134 | return_value=mock_forward_context_obj), \ |
132 | | - patch('vllm_ascend.ops.moe.moe_comm_method.MC2CommImpl._get_token_dispatcher', |
| 135 | + patch('vllm_ascend.ops.fused_moe.moe_comm_method.MC2CommImpl._get_token_dispatcher', |
133 | 136 | return_value=None), \ |
134 | | - patch('vllm_ascend.ops.moe.moe_comm_method.AlltoAllCommImpl._get_token_dispatcher', |
| 137 | + patch('vllm_ascend.ops.fused_moe.moe_comm_method.AlltoAllCommImpl._get_token_dispatcher', |
135 | 138 | return_value=None), \ |
136 | | - patch('vllm_ascend.ops.moe.moe_comm_method.AllGatherCommImpl._get_token_dispatcher', |
| 139 | + patch('vllm_ascend.ops.fused_moe.moe_comm_method.AllGatherCommImpl._get_token_dispatcher', |
137 | 140 | return_value=None), \ |
138 | | - patch('vllm_ascend.ops.moe.experts_selector.get_forward_context', |
| 141 | + patch('vllm_ascend.ops.fused_moe.experts_selector.get_forward_context', |
139 | 142 | return_value=mock_forward_context_obj): |
140 | 143 |
|
141 | 144 | yield { |
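
The fixture above chains a dozen-plus patch(...) context managers with backslash continuations. An equivalent pattern, shown here only as a sketch and not what this PR does, is contextlib.ExitStack, which enters each patch in a loop and unwinds them all together on exit; it assumes vllm_ascend is importable so patch can resolve the targets:

from contextlib import ExitStack
from unittest.mock import MagicMock, patch

with ExitStack() as stack:
    for target in (
        'vllm_ascend.ops.fused_moe.fused_moe.get_ep_group',
        'vllm_ascend.ops.fused_moe.fused_moe.get_mc2_group',
    ):
        stack.enter_context(patch(target, return_value=MagicMock()))
    # every patch is active inside this block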
@@ -319,8 +322,8 @@ def test_cumsum_group_list_with_type_2(self): |
319 | 322 |
|
320 | 323 | class TestUnifiedApplyMLP(TestBase): |
321 | 324 |
|
322 | | - @patch('vllm_ascend.ops.moe.moe_mlp.get_forward_context') |
323 | | - @patch('vllm_ascend.ops.moe.moe_mlp.is_310p') |
| 325 | + @patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context') |
| 326 | + @patch('vllm_ascend.ops.fused_moe.moe_mlp.is_310p') |
324 | 327 | @patch('torch_npu.npu_grouped_matmul') |
325 | 328 | @patch('torch_npu.npu_dynamic_quant') |
326 | 329 | @patch('torch_npu.npu_dequant_swiglu_quant') |
@@ -384,7 +387,7 @@ def test_unified_apply_mlp_with_quantization_mc2(self, mock_npu_dequant, |
384 | 387 |
|
385 | 388 | self.assertEqual(result.dtype, torch.bfloat16) |
386 | 389 |
|
387 | | - @patch('vllm_ascend.ops.moe.moe_mlp.is_310p') |
| 390 | + @patch('vllm_ascend.ops.fused_moe.moe_mlp.is_310p') |
388 | 391 | @patch('torch_npu.npu_grouped_matmul') |
389 | 392 | @patch('torch_npu.npu_swiglu') |
390 | 393 | @patch('torch_npu.npu_dynamic_quant') |
@@ -426,7 +429,7 @@ def test_unified_apply_mlp_without_quantization(self, |
426 | 429 | self.assertEqual(result.shape, hidden_states.shape) |
427 | 430 | self.assertEqual(result.dtype, torch.float16) |
428 | 431 |
|
429 | | - @patch('vllm_ascend.ops.moe.moe_mlp.get_forward_context') |
| 432 | + @patch('vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context') |
430 | 433 | @patch('torch_npu.npu_grouped_matmul') |
431 | 434 | @patch('torch_npu.npu_swiglu') |
432 | 435 | @patch('torch_npu.npu_dynamic_quant') |
@@ -486,7 +489,7 @@ def test_unified_apply_mlp_with_quantization_and_dynamic_scale( |
486 | 489 | self.assertEqual(result.shape, hidden_states.shape) |
487 | 490 | self.assertEqual(result.dtype, torch.bfloat16) |
488 | 491 |
|
489 | | - @patch('vllm_ascend.ops.moe.moe_mlp.is_310p') |
| 492 | + @patch('vllm_ascend.ops.fused_moe.moe_mlp.is_310p') |
490 | 493 | @patch('torch_npu.npu_grouped_matmul') |
491 | 494 | @patch('torch_npu.npu_swiglu') |
492 | 495 | @patch('torch_npu.npu_dynamic_quant') |
@@ -531,7 +534,7 @@ def test_unified_apply_mlp_without_quantization_310p( |
531 | 534 | self.assertEqual(result.shape, hidden_states.shape) |
532 | 535 | self.assertEqual(result.dtype, torch.float16) |
533 | 536 |
|
534 | | - @patch("vllm_ascend.ops.moe.moe_mlp.get_forward_context") |
| 537 | + @patch("vllm_ascend.ops.fused_moe.moe_mlp.get_forward_context") |
535 | 538 | @patch("torch_npu.npu_grouped_matmul") |
536 | 539 | @patch("torch_npu.npu_swiglu") |
537 | 540 | @patch("torch_npu.npu_grouped_matmul_swiglu_quant") |
@@ -595,3 +598,39 @@ def test_unified_apply_mlp_with_quantization_and_fusion_mlp( |
595 | 598 | self.assertTrue(mock_forward_context.with_quant) |
596 | 599 | self.assertEqual(result.shape, hidden_states.shape) |
597 | 600 | self.assertEqual(result.dtype, torch.bfloat16) |
| 601 | + |
| 602 | + |
| 603 | +class TestLoadWeight(TestBase): |
| 604 | + |
| 605 | + def test_load_w13_transpose(self): |
| 606 | + with patch.object(AscendFusedMoE, "__init__", |
| 607 | + lambda self, *args, **kwargs: None): |
| 608 | + moe = AscendFusedMoE(num_experts=4, top_k=2, hidden_size=8) |
| 609 | + |
| 610 | + expert_data = torch.randn(128, 8) |
| 611 | + loaded_weight = torch.randn(128, 4) |
| 612 | + moe._load_w13(expert_data, 1, "w1", loaded_weight, 0) |
| 613 | + |
| 614 | + expert_data = torch.randn(8, 128) |
| 615 | + loaded_weight = torch.randn(128, 4) |
| 616 | + moe._load_w13(expert_data, 1, "w1", loaded_weight, 0) |
| 617 | + |
| 618 | + expert_data = torch.randn(128, 8) |
| 619 | + loaded_weight = torch.randn(128, 4) |
| 620 | + moe._load_w13(expert_data, 1, "w3", loaded_weight, 0) |
| 621 | + |
| 622 | + expert_data = torch.randn(8, 128) |
| 623 | + loaded_weight = torch.randn(128, 4) |
| 624 | + moe._load_w13(expert_data, 1, "w3", loaded_weight, 0) |
| 625 | + |
| 626 | + def test_load_w2_transpose(self): |
| 627 | + with patch.object(AscendFusedMoE, "__init__", |
| 628 | + lambda self, *args, **kwargs: None): |
| 629 | + moe = AscendFusedMoE(num_experts=4, top_k=2, hidden_size=8) |
| 630 | + expert_data = torch.randn(128, 4) |
| 631 | + loaded_weight = torch.randn(128, 8) |
| 632 | + moe._load_w2(expert_data, 1, loaded_weight, 0) |
| 633 | + |
| 634 | + expert_data = torch.randn(4, 128) |
| 635 | + loaded_weight = torch.randn(128, 8) |
| 636 | + moe._load_w2(expert_data, 1, loaded_weight, 0) |
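
The new TestLoadWeight cases construct AscendFusedMoE without running its real __init__ (patched to a no-op lambda), then call _load_w13/_load_w2 directly with both row-major and transposed expert_data shapes, using "no exception raised" as the pass condition. A toy sketch of that bypass pattern, with a hypothetical Layer class standing in for AscendFusedMoE:

from unittest.mock import patch
import torch

class Layer:
    def __init__(self):
        raise RuntimeError("heavy distributed setup we skip in unit tests")

    def _load(self, expert_data: torch.Tensor, loaded_weight: torch.Tensor):
        # The real helpers would decide here whether loaded_weight needs a
        # transpose before the copy; this toy version copies a plain slice.
        expert_data[:, :loaded_weight.shape[1]].copy_(loaded_weight)

with patch.object(Layer, "__init__", lambda self, *args, **kwargs: None):
    layer = Layer()                       # heavy __init__ never runs
    data = torch.zeros(8, 4)
    layer._load(data, torch.ones(8, 2))   # helper exercised in isolation
    assert data[:, :2].eq(1).all()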