
Commit b3de51e

adding MoE layers to the global engine and decoder layers, adding the router loss to the trainer
1 parent c73578e commit b3de51e

File tree

5 files changed: +477 -189 lines changed


config/default_config.yml

Lines changed: 28 additions & 8 deletions
@@ -24,7 +24,7 @@ ae_adapter_with_residual: True
 ae_adapter_dropout_rate: 0.1
 
 ae_global_dim_embed: 2048
-ae_global_num_blocks: 8
+ae_global_num_blocks: 2
 ae_global_num_heads: 32
 ae_global_dropout_rate: 0.1
 ae_global_with_qk_lnorm: True
@@ -42,12 +42,12 @@ pred_mlp_adaln: True
 
 # number of steps offset applied to first target window; if set to zero and forecast_steps=0 then
 # one is training an auto-encoder
-forecast_offset : 0
+forecast_offset : 1
 forecast_delta_hrs: 0
-forecast_steps: 0
-forecast_policy: null
+forecast_steps: 1
+forecast_policy: "fixed"
 forecast_att_dense_rate: 1.0
-fe_num_blocks: 0
+fe_num_blocks: 2
 fe_num_heads: 16
 fe_dropout_rate: 0.1
 fe_with_qk_lnorm: True
@@ -93,7 +93,7 @@ ema_halflife_in_thousands: 1e-3
 
 # training mode: "forecast" or "masking" (masked token modeling)
 # for "masking" to train with auto-encoder mode, forecast_offset should be 0
-training_mode: "masking"
+training_mode: "forecast"
 # masking rate when training mode is "masking"; ignored in foreacast mode
 masking_rate: 0.6
 # sample the masking rate (with normal distribution centered at masking_rate)
@@ -168,8 +168,28 @@ train_log:
 
 # Forecast MLP type: "dense" (default) or "moe"
 fe_mlp_type: "dense" # set to "moe" to enable MoE
+ae_global_mlp_type: "dense" # set to "moe" to enable MoE
+ffn_mlp_type: "dense" # set to "moe" to enable MoE in the feed-forward network of the decoder blocks
+decoder_mlp_type: "dense" # set to "moe" to enable MoE in the decoder prediction MLP
+moe_lambda: 0.02 # coefficient for the MoE load balancing loss
 
 # MoE-only params (ignored when fe_mlp_type != "moe")
-fe_moe_num_experts: 8
-fe_moe_top_k: 2
+fe_moe_num_experts: 2
+fe_moe_top_k: 1
 fe_moe_hidden_factor: 0.5 # = HF_dense / 4
+
+# MoE-only params (ignored when ae_global_mlp_type != "moe")
+ae_global_moe_num_experts: 4
+ae_global_moe_top_k: 2
+ae_global_moe_hidden_factor: 0.5 # = HF_dense / 4
+
+# MoE-only params (ignored when ffn_mlp_type != "moe")
+ffn_moe_num_experts: 2
+ffn_moe_top_k: 1
+ffn_moe_hidden_factor: 0.5 # = HF_dense / 4
+
+# MoE-only params (ignored when decoder_mlp_type != "moe")
+decoder_moe_num_experts: 2
+decoder_moe_top_k: 1
+decoder_moe_hidden_factor: 0.5 # = HF_dense / 4
+tr_mlp_hidden_factor: 2
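
The new moe_lambda key weights the MoE load-balancing (router) loss that, per the commit message, is added to the trainer; the trainer change itself is not among the files shown here. Below is a minimal sketch of how such a weighting is typically applied, assuming the MoE blocks expose their auxiliary term as a router_loss attribute (a hypothetical name used only for illustration):

import torch
from torch import nn


def loss_with_router_penalty(task_loss: torch.Tensor, model: nn.Module, moe_lambda: float = 0.02) -> torch.Tensor:
    # Collect the auxiliary load-balancing terms from all MoE blocks.
    # The attribute name "router_loss" is an assumption for this sketch;
    # the MoEMLP blocks in this repo may expose it under a different name.
    aux = task_loss.new_zeros(())
    for module in model.modules():
        router_loss = getattr(module, "router_loss", None)
        if router_loss is not None:
            aux = aux + router_loss
    # moe_lambda mirrors the new config key above (default 0.02).
    return task_loss + moe_lambda * aux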

src/weathergen/model/blocks.py

Lines changed: 96 additions & 29 deletions
@@ -14,10 +14,12 @@
     MultiCrossAttentionHeadVarlen,
     MultiSelfAttentionHeadVarlen,
 )
-from weathergen.model.layers import MLP
+from weathergen.model.layers import MLP, MoEMLP
 from weathergen.model.norms import AdaLayerNormLayer
 from weathergen.utils.utils import get_dtype
 
+import logging
+logger = logging.getLogger(__name__)
 
 class SelfAttentionBlock(nn.Module):
     """
@@ -43,14 +45,32 @@ def __init__(self, dim, dim_aux, with_adanorm, num_heads, dropout_rate, **kwargs
         self.mhsa_block = lambda x, _, **kwargs: self.mhsa(self.ln_sa(x), **kwargs) + x
 
         approx_gelu = lambda: nn.GELU(approximate="tanh")
-        self.mlp = MLP(
-            dim_in=dim,
-            dim_out=dim,
-            hidden_factor=4,
-            dropout_rate=0.1,
-            nonlin=approx_gelu,
-            with_residual=False,
-        )
+        use_moe_ffn = (kwargs.get("ffn_mlp_type", "dense") == "moe")
+        ffn_hidden_factor = kwargs.get("ffn_hidden_factor", 4)
+        moe_kwargs = kwargs.get("moe_kwargs", {})  # e.g. num_experts, top_k, router_noisy_std
+
+        if use_moe_ffn:
+            self.mlp = MoEMLP(
+                dim_in=dim,
+                dim_out=dim,
+                hidden_factor=ffn_hidden_factor,
+                dropout_rate=0.1,
+                nonlin=nn.GELU,  # internal block constructs nonlin()
+                with_residual=False,
+                norm_type=kwargs["attention_kwargs"]["norm_type"],
+                dim_aux=(dim_aux if self.with_adanorm else None),
+                norm_eps=kwargs["attention_kwargs"]["norm_eps"],
+                **moe_kwargs,  # <- e.g. num_experts=8, top_k=2, router_noisy_std=0.01
+            )
+        else:
+            self.mlp = MLP(
+                dim_in=dim,
+                dim_out=dim,
+                hidden_factor=4,
+                dropout_rate=0.1,
+                nonlin=approx_gelu,
+                with_residual=False,
+            )
         if self.with_adanorm:
             self.mlp_fn = lambda x, **kwargs: self.mlp(x)
             self.mlp_block = AdaLayerNormLayer(dim, dim_aux, self.mlp_fn, dropout_rate)
@@ -104,7 +124,7 @@ def __init__(
 
         self.with_adanorm = with_adanorm
         self.with_self_attn = with_self_attn
-        self.with_mlp = with_self_attn
+        self.with_mlp = with_mlp
 
         if with_self_attn:
             self.mhsa = MultiSelfAttentionHeadVarlen(
@@ -136,18 +156,37 @@
 
         if self.with_mlp:
             approx_gelu = lambda: nn.GELU(approximate="tanh")
-            self.mlp = MLP(
-                dim_in=dim_q,
-                dim_out=dim_q,
-                hidden_factor=4,
-                nonlin=approx_gelu,
-                with_residual=False,
-            )
+
+            use_moe_ffn = (kwargs.get("ffn_mlp_type", "dense") == "moe")
+            ffn_hidden_factor = kwargs.get("ffn_hidden_factor", 4)
+            moe_kwargs = kwargs.get("moe_kwargs", {})
+
+            if use_moe_ffn:
+                self.mlp = MoEMLP(
+                    dim_in=dim_q,
+                    dim_out=dim_q,
+                    hidden_factor=ffn_hidden_factor,
+                    dropout_rate=0.1,
+                    nonlin=nn.GELU,  # internal block constructs nonlin()
+                    with_residual=False,
+                    norm_type=kwargs["attention_kwargs"]["norm_type"],
+                    dim_aux=(dim_aux if self.with_adanorm else None),
+                    norm_eps=kwargs["attention_kwargs"]["norm_eps"],
+                    **moe_kwargs,  # <- e.g. num_experts=8, top_k=2, router_noisy_std=0.01
+                )
+            else:
+                self.mlp = MLP(
+                    dim_in=dim_q,
+                    dim_out=dim_q,
+                    hidden_factor=4,
+                    nonlin=approx_gelu,
+                    with_residual=False,
+                )
             if self.with_adanorm:
                 self.mlp_fn = lambda x, **kwargs: self.mlp(x)
                 self.mlp_block = AdaLayerNormLayer(dim_q, dim_aux, self.mlp_fn, dropout_rate)
             else:
-                self.ln_mlp = nn.LayerNorm(dim_q, eps=kwargs["attention_kwargs"]["norm_eps"])
+                self.ln_mlp = nn.LayerNorm(eps=kwargs["attention_kwargs"]["norm_eps"])
                 self.mlp_block = lambda x, _, **kwargs: self.mlp(self.ln_mlp(x)) + x
         else:
             self.mlp_block = lambda x, _, **kwargs: x
@@ -191,6 +230,7 @@ def __init__(
         tr_mlp_hidden_factor,
         tro_type,
         mlp_norm_eps=1e-6,
+        **kwargs,
     ):
         super().__init__()
 
@@ -237,19 +277,46 @@
             )
         )
         # MLP Block
-        self.block.append(
-            MLP(
-                dim_in,
-                dim_out,
-                with_residual=True,
-                hidden_factor=self.tr_mlp_hidden_factor,
-                dropout_rate=0.1,  # Assuming dropout_rate is 0.1
-                norm_type=self.cf.norm_type,
-                dim_aux=(dim_aux if self.cf.pred_mlp_adaln else None),
-                norm_eps=self.cf.mlp_norm_eps,
-            )
+        # Add MoE option
+        use_moe = getattr(self.cf, "decoder_mlp_type", "dense") == "moe"
+        logger.info(
+            "[MoE] Decoder head: type=%s%s",
+            "moe" if use_moe else "dense",
+            ("" if not use_moe else
+             f" (experts={getattr(self.cf,'moe_num_experts',None)}, top_k={getattr(self.cf,'moe_top_k',None)})"),
         )
 
+        if use_moe:
+            self.block.append(
+                MoEMLP(
+                    dim_in,
+                    dim_out,
+                    hidden_factor=self.tr_mlp_hidden_factor,
+                    dropout_rate=0.1,
+                    with_residual=True,  # mirror dense
+                    norm_type=self.cf.norm_type,
+                    dim_aux=(dim_aux if self.cf.pred_mlp_adaln else None),
+                    norm_eps=self.cf.mlp_norm_eps,
+                    num_experts=getattr(self.cf, "moe_num_experts", 8),
+                    top_k=getattr(self.cf, "moe_top_k", 2),
+                    router_noisy_std=getattr(self.cf, "moe_router_noisy_std", 0.0),
+                    use_checkpoint=getattr(self.cf, "moe_use_checkpoint", False),
+                )
+            )
+        else:
+            self.block.append(
+                MLP(
+                    dim_in,
+                    dim_out,
+                    with_residual=True,
+                    hidden_factor=self.tr_mlp_hidden_factor,
+                    dropout_rate=0.1,  # Assuming dropout_rate is 0.1
+                    norm_type=self.cf.norm_type,
+                    dim_aux=(dim_aux if self.cf.pred_mlp_adaln else None),
+                    norm_eps=self.cf.mlp_norm_eps,
+                )
+            )
+
     def forward(self, latent, output, coords, latent_lens, output_lens):
         for layer in self.block:
             if isinstance(layer, MultiCrossAttentionHeadVarlen):
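
The blocks above now switch between the dense MLP and MoEMLP based on ffn_mlp_type, but MoEMLP itself lives in weathergen.model.layers and is not part of this diff. The following is a minimal sketch of a top-k routed feed-forward layer that illustrates what num_experts, top_k and router_noisy_std control; it is a generic illustration of the technique, not the repo's MoEMLP implementation:

import torch
from torch import nn


class TopKMoEMLP(nn.Module):
    # Illustrative top-k routed feed-forward layer (hypothetical, not the repo's MoEMLP).
    def __init__(self, dim_in, dim_out, hidden_factor=4, num_experts=2, top_k=1,
                 router_noisy_std=0.0, dropout_rate=0.1):
        super().__init__()
        dim_hidden = int(dim_in * hidden_factor)
        self.experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(dim_in, dim_hidden),
                nn.GELU(approximate="tanh"),
                nn.Dropout(dropout_rate),
                nn.Linear(dim_hidden, dim_out),
            )
            for _ in range(num_experts)
        ])
        self.router = nn.Linear(dim_in, num_experts)
        self.top_k = top_k
        self.router_noisy_std = router_noisy_std
        self.dim_out = dim_out

    def forward(self, x):  # x: (num_tokens, dim_in)
        logits = self.router(x)
        if self.training and self.router_noisy_std > 0:
            # Noisy gating: perturb router logits during training only.
            logits = logits + torch.randn_like(logits) * self.router_noisy_std
        weights = logits.softmax(dim=-1)
        top_w, top_idx = weights.topk(self.top_k, dim=-1)   # (num_tokens, top_k)
        top_w = top_w / top_w.sum(dim=-1, keepdim=True)     # renormalize over selected experts
        out = torch.zeros(x.shape[0], self.dim_out, device=x.device, dtype=x.dtype)
        for e, expert in enumerate(self.experts):
            for k in range(self.top_k):
                mask = top_idx[:, k] == e
                if mask.any():
                    out[mask] += top_w[mask, k:k + 1] * expert(x[mask])
        # A load-balancing ("router") loss would typically be derived from
        # `weights` here and accumulated for the trainer (see moe_lambda above).
        return out

With top_k=1 each token is processed by a single expert, so per-token compute stays close to a dense MLP with the reduced hidden_factor used in the config above.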

src/weathergen/model/engines.py

Lines changed: 82 additions & 22 deletions
@@ -24,10 +24,12 @@
     StreamEmbedLinear,
     StreamEmbedTransformer,
 )
-from weathergen.model.layers import MLP
+from weathergen.model.layers import MLP, MoEMLP
 from weathergen.model.utils import ActivationFactory
 from weathergen.utils.utils import get_dtype
 
+import logging
+logger = logging.getLogger(__name__)
 
 class EmbeddingEngine:
     name: "EmbeddingEngine"
@@ -249,17 +251,50 @@ def create(self) -> torch.nn.ModuleList:
                 )
             )
             # MLP block
-            self.ae_global_blocks.append(
-                MLP(
-                    self.cf.ae_global_dim_embed,
-                    self.cf.ae_global_dim_embed,
-                    with_residual=True,
-                    dropout_rate=self.cf.ae_global_dropout_rate,
-                    hidden_factor=self.cf.ae_global_mlp_hidden_factor,
-                    norm_type=self.cf.norm_type,
-                    norm_eps=self.cf.mlp_norm_eps,
-                )
+            # Add MoE option
+            use_moe = getattr(self.cf, "ae_global_mlp_type", "dense") == "moe"
+            mlp_common_kwargs = dict(
+                dim_in=self.cf.ae_global_dim_embed,
+                dim_out=self.cf.ae_global_dim_embed,
+                with_residual=True,
+                dropout_rate=self.cf.ae_global_dropout_rate,
+                norm_type=self.cf.norm_type,
+                norm_eps=self.cf.mlp_norm_eps,
             )
+            if use_moe:
+                self.ae_global_blocks.append(
+                    MoEMLP(
+                        **mlp_common_kwargs,
+                        num_experts=getattr(self.cf, "ae_global_moe_num_experts", 2),
+                        top_k=getattr(self.cf, "ae_global_moe_top_k", 1),
+                        router_noisy_std=getattr(self.cf, "ae_global_moe_router_noisy_std", 0.0),
+                        hidden_factor=getattr(self.cf, "ae_global_moe_hidden_factor", 2),
+                    )
+                )
+            else:
+                self.ae_global_blocks.append(
+                    MLP(
+                        self.cf.ae_global_dim_embed,
+                        self.cf.ae_global_dim_embed,
+                        with_residual=True,
+                        dropout_rate=self.cf.ae_global_dropout_rate,
+                        hidden_factor=self.cf.ae_global_mlp_hidden_factor,
+                        norm_type=self.cf.norm_type,
+                        norm_eps=self.cf.mlp_norm_eps,
+                    )
+                )
+        # Count MoE blocks
+        num_moe = sum(1 for m in self.ae_global_blocks if isinstance(m, MoEMLP))
+        logger.info(
+            "[MoE] GlobalAssimilationEngine: %d MoEMLP blocks "
+            "(ae_global_mlp_type=%s, experts=%s, top_k=%s, hidden_factor=%s)",
+            num_moe,
+            getattr(self.cf, "ae_global_mlp_type", "dense"),
+            getattr(self.cf, "ae_global_moe_num_experts", None),
+            getattr(self.cf, "ae_global_moe_top_k", None),
+            getattr(self.cf, "ae_global_moe_hidden_factor", None),
+        )
+
         return self.ae_global_blocks
 
 
@@ -343,8 +378,8 @@ def create(self) -> torch.nn.ModuleList:
                 self.fe_blocks.append(
                     MoEMLP(
                         **mlp_common_kwargs,
-                        num_experts=getattr(self.cf, "fe_moe_num_experts", 8),
-                        top_k=getattr(self.cf, "fe_moe_top_k", 4),
+                        num_experts=getattr(self.cf, "fe_moe_num_experts", 2),
+                        top_k=getattr(self.cf, "fe_moe_top_k", 2),
                         router_noisy_std=getattr(self.cf, "fe_moe_router_noisy_std", 0.0),
                         hidden_factor=getattr(self.cf, "fe_moe_hidden_factor", 2),
                     )
@@ -362,15 +397,24 @@ def create(self) -> torch.nn.ModuleList:
                     )
                 )
         # ------------------------------------------------------------------
-        def init_weights_final(m):
-            if isinstance(m, torch.nn.Linear):
-                torch.nn.init.normal_(m.weight, mean=0, std=0.001)
-                if m.bias is not None:
-                    torch.nn.init.normal_(m.bias, mean=0, std=0.001)
-
-        for block in self.fe_blocks:
-            block.apply(init_weights_final)
-
+        # def init_weights_final(m):
+        #     if isinstance(m, torch.nn.Linear) and not getattr(m, "is_moe_router", False):
+        #         torch.nn.init.normal_(m.weight, mean=0, std=0.001)
+        #         if m.bias is not None:
+        #             torch.nn.init.normal_(m.bias, mean=0, std=0.001)
+
+        # for block in self.fe_blocks:
+        #     block.apply(init_weights_final)
+        num_moe = sum(1 for m in self.fe_blocks if isinstance(m, MoEMLP))
+        logger.info(
+            "[MoE] ForecastingEngine: %d MoEMLP blocks "
+            "(fe_mlp_type=%s, experts=%s, top_k=%s, hidden_factor=%s)",
+            num_moe,
+            getattr(self.cf, "fe_mlp_type", "dense"),
+            getattr(self.cf, "fe_moe_num_experts", None),
+            getattr(self.cf, "fe_moe_top_k", None),
+            getattr(self.cf, "fe_moe_hidden_factor", None),
+        )
         return self.fe_blocks
 
 
@@ -619,6 +663,14 @@ def __init__(
                     with_adanorm=False,
                     with_mlp=False,
                     attention_kwargs=attention_kwargs,
+                    ffn_mlp_type=getattr(self.cf, "decoder_ffn_mlp_type", "dense"),
+                    ffn_hidden_factor=getattr(self.cf, "decoder_ffn_hidden_factor", 4),
+                    moe_kwargs=dict(
+                        num_experts=getattr(self.cf, "decoder_moe_num_experts", 2),
+                        top_k=getattr(self.cf, "decoder_moe_top_k", 2),
+                        router_noisy_std=getattr(self.cf, "decoder_moe_router_noisy_std", 0.0),
+                        use_checkpoint=getattr(self.cf, "decoder_moe_use_checkpoint", False),
+                    )
                 )
             )
         elif self.cf.decoder_type == "AdaLayerNormConditioning":
@@ -674,6 +726,14 @@ def __init__(
                     tr_mlp_hidden_factor=tr_mlp_hidden_factor,
                     tro_type=tro_type,
                     mlp_norm_eps=self.cf.mlp_norm_eps,
+                    ffn_mlp_type=getattr(self.cf, "decoder_ffn_mlp_type", "dense"),
+                    ffn_hidden_factor=getattr(self.cf, "decoder_ffn_hidden_factor", 4),
+                    moe_kwargs=dict(
+                        num_experts=getattr(self.cf, "decoder_moe_num_experts", 2),
+                        top_k=getattr(self.cf, "decoder_moe_top_k", 2),
+                        router_noisy_std=getattr(self.cf, "decoder_moe_router_noisy_std", 0.0),
+                        use_checkpoint=getattr(self.cf, "decoder_moe_use_checkpoint", False),
+                    )
                 )
             )
         else: