@@ -10,8 +10,6 @@
 from typing import Tuple
 
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
 from torch.testing._internal import common_utils
 from torch.testing._internal.common_utils import (
     TestCase,
@@ -28,6 +26,7 @@
 )
 from torchao.quantization.quantize_.common import KernelPreference
 from torchao.quantization.utils import compute_error
+from torchao.testing.model_architectures import Experts
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_8,
     _is_fbgemm_genai_gpu_available,
@@ -39,66 +38,6 @@
 torch._dynamo.config.cache_size_limit = 128
 
 
-class Experts(nn.Module):
-    def __init__(
-        self,
-        num_local_experts: int,
-        dim: int,
-        hidden_dim: int,
-        dtype: torch.dtype,
-        device: torch.device,
-    ) -> None:
-        super().__init__()
-
-        self.num_local_experts = num_local_experts
-        self.dim = dim
-
-        self.w1: nn.Parameter = nn.Parameter(
-            torch.randn(
-                num_local_experts,
-                dim,
-                hidden_dim,
-                dtype=dtype,
-                device=device,
-            )
-        )
-
-        self.w2: nn.Parameter = nn.Parameter(
-            torch.randn(
-                num_local_experts,
-                hidden_dim,
-                dim,
-                dtype=dtype,
-                device=device,
-            )
-        )
-
-        self.w3: nn.Parameter = nn.Parameter(
-            torch.randn(
-                num_local_experts,
-                dim,
-                hidden_dim,
-                dtype=dtype,
-                device=device,
-            )
-        )
-
-    def forward(
-        self,
-        routed_in_egD: torch.Tensor,  # noqa: N803
-    ) -> torch.Tensor:
-        e = self.num_local_experts
-        D = self.dim
-
-        x_egD = routed_in_egD.view(e, -1, D)
-
-        middle_out_egF = F.silu(torch.bmm(x_egD, self.w1)) * torch.bmm(x_egD, self.w3)
-        out_egD = torch.bmm(middle_out_egF, self.w2)
-        out_egD = out_egD.view(-1, D)
-
-        return out_egD
-
-
 class ToyLinearModel(torch.nn.Module):
     def __init__(self, in_features, out_features):
         super().__init__()
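For context, the diff replaces the file-local Experts definition with the shared copy in torchao.testing.model_architectures, so other tests can reuse the same model instead of redefining it. Below is a minimal usage sketch, assuming the relocated class keeps the constructor and forward signature of the code removed above; the sizes are illustrative and not taken from the test:

import torch

from torchao.testing.model_architectures import Experts

# Assumed to match the deleted class: per-expert weights w1/w3 have shape
# (num_local_experts, dim, hidden_dim) and w2 has shape
# (num_local_experts, hidden_dim, dim).
experts = Experts(
    num_local_experts=4,
    dim=64,
    hidden_dim=128,
    dtype=torch.bfloat16,
    device=torch.device("cpu"),
)

# forward takes routed tokens flattened to (num_experts * tokens_per_expert, dim),
# reshapes them to (e, g, D), applies a per-expert SwiGLU MLP via batched matmuls
# (silu(x @ w1) * (x @ w3)) @ w2, and flattens the result back to (-1, D).
routed_in = torch.randn(4 * 8, 64, dtype=torch.bfloat16)
out = experts(routed_in)
assert out.shape == routed_in.shape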