@@ -10,8 +10,6 @@
 from typing import Tuple
 
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
 from torch.testing._internal import common_utils
 from torch.testing._internal.common_utils import (
     TestCase,
@@ -28,6 +26,7 @@
 )
 from torchao.quantization.quantize_.common import KernelPreference
 from torchao.quantization.utils import compute_error
+from torchao.testing.model_architectures import Experts
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_8,
     _is_fbgemm_genai_gpu_available,
@@ -39,66 +38,6 @@
 torch._dynamo.config.cache_size_limit = 128
 
 
-class Experts(nn.Module):
-    def __init__(
-        self,
-        num_local_experts: int,
-        dim: int,
-        hidden_dim: int,
-        dtype: torch.dtype,
-        device: torch.device,
-    ) -> None:
-        super().__init__()
-
-        self.num_local_experts = num_local_experts
-        self.dim = dim
-
-        self.w1: nn.Parameter = nn.Parameter(
-            torch.randn(
-                num_local_experts,
-                dim,
-                hidden_dim,
-                dtype=dtype,
-                device=device,
-            )
-        )
-
-        self.w2: nn.Parameter = nn.Parameter(
-            torch.randn(
-                num_local_experts,
-                hidden_dim,
-                dim,
-                dtype=dtype,
-                device=device,
-            )
-        )
-
-        self.w3: nn.Parameter = nn.Parameter(
-            torch.randn(
-                num_local_experts,
-                dim,
-                hidden_dim,
-                dtype=dtype,
-                device=device,
-            )
-        )
-
-    def forward(
-        self,
-        routed_in_egD: torch.Tensor,  # noqa: N803
-    ) -> torch.Tensor:
-        e = self.num_local_experts
-        D = self.dim
-
-        x_egD = routed_in_egD.view(e, -1, D)
-
-        middle_out_egF = F.silu(torch.bmm(x_egD, self.w1)) * torch.bmm(x_egD, self.w3)
-        out_egD = torch.bmm(middle_out_egF, self.w2)
-        out_egD = out_egD.view(-1, D)
-
-        return out_egD
-
-
 class ToyLinearModel(torch.nn.Module):
     def __init__(self, in_features, out_features):
         super().__init__()
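For context, the diff replaces the file-local Experts definition with the shared copy in torchao.testing.model_architectures, so other tests can reuse the same model instead of redefining it. Below is a minimal usage sketch, assuming the relocated class keeps the constructor and forward signature of the code removed above; the sizes are illustrative and not taken from the test:

import torch

from torchao.testing.model_architectures import Experts

# Assumed to match the deleted class: per-expert weights w1/w3 have shape
# (num_local_experts, dim, hidden_dim) and w2 has shape
# (num_local_experts, hidden_dim, dim).
experts = Experts(
    num_local_experts=4,
    dim=64,
    hidden_dim=128,
    dtype=torch.bfloat16,
    device=torch.device("cpu"),
)

# forward takes routed tokens flattened to (num_experts * tokens_per_expert, dim),
# reshapes them to (e, g, D), applies a per-expert SwiGLU MLP via batched matmuls
# (silu(x @ w1) * (x @ w3)) @ w2, and flattens the result back to (-1, D).
routed_in = torch.randn(4 * 8, 64, dtype=torch.bfloat16)
out = experts(routed_in)
assert out.shape == routed_in.shape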