From a0039ecfebd8634588a3f5afa44e66f712fa365f Mon Sep 17 00:00:00 2001
From: sophiex <24638638+sophie-xhonneux@users.noreply.github.com>
Date: Wed, 6 Aug 2025 12:24:36 +0000
Subject: [PATCH 01/19] Log gradient norms

---
 src/weathergen/train/trainer.py      | 18 ++++++++++++++++--
 src/weathergen/utils/train_logger.py |  2 ++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
index 56a28d089..46653b9e9 100644
--- a/src/weathergen/train/trainer.py
+++ b/src/weathergen/train/trainer.py
@@ -76,6 +76,7 @@ def init(
 
         self.init_perf_monitoring()
         self.train_logger = TrainLogger(cf, config.get_path_run(self.cf))
+        self.last_grad_norm = 0.0
 
     def inference(self, cf, run_id_trained, epoch):
         # general initalization
@@ -482,7 +483,19 @@ def train(self, epoch):
 
             # gradient clipping
             self.grad_scaler.unscale_(self.optimizer)
-            torch.nn.utils.clip_grad_norm_(self.ddp_model.parameters(), max_norm=cf.grad_clip)
+            total_norm = torch.nn.utils.clip_grad_norm_(
+                self.ddp_model.parameters(), max_norm=cf.grad_clip
+            )
+
+            # log gradient norms
+            if bidx % log_interval == 0:
+                grad_norms = { "total_grad_norm" : total_norm.item() }
+                self.last_grad_norm = total_norm.item()
+                for name, param in self.ddp_model.named_parameters():
+                    if param.grad is not None:
+                        grad_norms[name] = param.grad.norm().item()
+                self.train_logger.log_metrics(TRAIN, grad_norms)
+
 
             # optimizer step
             self.grad_scaler.step(self.optimizer)
@@ -718,7 +731,7 @@ def _log_terminal(self, bidx: int, epoch: int, stage: Stage):
                     # samples per sec
                     dt = time.time() - self.t_start
                     pstr = "{:03d} : {:05d}/{:05d} : {:06d} : loss = {:.4E} "
-                    pstr += "(lr={:.2E}, s/sec={:.3f})"
+                    pstr += "(lr={:.2E}, gradient norm={:.3f}, s/sec={:.3f})"
                     len_dataset = len(self.data_loader) // self.cf.batch_size_per_gpu
                     print(
                         pstr.format(
@@ -728,6 +741,7 @@ def _log_terminal(self, bidx: int, epoch: int, stage: Stage):
                             self.cf.istep,
                             avg_loss.nanmean().item(),
                             self.lr_scheduler.get_lr(),
+                            self.last_grad_norm,
                             (self.print_freq * self.cf.batch_size_per_gpu) / dt,
                         ),
                         flush=True,
diff --git a/src/weathergen/utils/train_logger.py b/src/weathergen/utils/train_logger.py
index be70a243b..c4db39172 100644
--- a/src/weathergen/utils/train_logger.py
+++ b/src/weathergen/utils/train_logger.py
@@ -146,6 +146,8 @@ def add_train(
             metrics[_performance_gpu] = perf_gpu
         if perf_mem > 0.0:
             metrics[_performance_memory] = perf_mem
+
+
         self.log_metrics("train", metrics)
         with open(self.path_run / (self.cf.run_id + "_perf_log.txt"), "ab") as f:
             np.savetxt(f, log_vals)

From e83903b5f6799854933550dbe3ef4b0ac36b227c Mon Sep 17 00:00:00 2001
From: sophiex <24638638+sophie-xhonneux@users.noreply.github.com>
Date: Wed, 6 Aug 2025 15:24:05 +0000
Subject: [PATCH 02/19] Prototype for recording grad norms

---
 pyproject.toml                          |   1 +
 src/weathergen/train/trainer.py         |   2 +-
 src/weathergen/utils/plot_grad_norms.py | 483 ++++++++++++++++++++++++
 uv.lock                                 |  16 +
 4 files changed, 501 insertions(+), 1 deletion(-)
 create mode 100644 src/weathergen/utils/plot_grad_norms.py

diff --git a/pyproject.toml b/pyproject.toml
index aa6232bb8..7511b0327 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
  "dask~=2025.5.1",
  "hatchling",
  "weathergen-common",
+ "seaborn>=0.13.2",
 ]
 
 [project.urls]
diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
index 46653b9e9..4430211ac 100644
--- a/src/weathergen/train/trainer.py
+++ b/src/weathergen/train/trainer.py
@@ -493,7 +493,7 @@ def train(self, epoch):
                 self.last_grad_norm = total_norm.item()
                 for name, param in self.ddp_model.named_parameters():
                     if param.grad is not None:
-                        grad_norms[name] = param.grad.norm().item()
+                        grad_norms["grad_norm_" + name] = param.grad.norm().item()
                 self.train_logger.log_metrics(TRAIN, grad_norms)
 
 
diff --git a/src/weathergen/utils/plot_grad_norms.py b/src/weathergen/utils/plot_grad_norms.py
new file mode 100644
index 000000000..8a6ded4ac
--- /dev/null
+++ b/src/weathergen/utils/plot_grad_norms.py
@@ -0,0 +1,483 @@
+import json
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from pathlib import Path
+import seaborn as sns
+from collections import defaultdict
+import re
+
+class GradientNormsAnalyzer:
+    def __init__(self, json_file_path):
+        """
+        Store the path to the gradient-norm log and load it right away via load_data().
+        Expected format: one JSON object per line with step info and gradient norms.
+        """
+        self.json_file_path = Path(json_file_path)
+        self.data = []  # parsed JSON objects, one per input line
+        self.df = None  # DataFrame assigned by create_dataframe()
+        self.load_data()
+        
+    def load_data(self):
+        """Load one JSON object per line into self.data (blank lines are skipped,
+        unparsable lines are reported and skipped), then rebuild self.df."""
+        print(f"Loading data from {self.json_file_path}...")
+        with open(self.json_file_path, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                if not line.strip():
+                    continue  # blank line, e.g. a trailing newline: nothing to parse
+                try:
+                    self.data.append(json.loads(line))
+                except json.JSONDecodeError as e:
+                    print(f"Warning: Could not parse line {line_num}: {e}")
+        print(f"Loaded {len(self.data)} data points")
+        self.create_dataframe()
+    
+    def create_dataframe(self):
+        """Flatten self.data into self.df with one row per (entry, parameter) pair."""
+        rows = []
+        
+        for ith, entry in enumerate(self.data):
+            # NOTE(review): assumes one entry per 5 training steps -- confirm log interval
+            step = ith * 5
+            
+            # Handle different possible data structures
+            if 'gradients' in entry:
+                grad_data = entry['gradients']
+            elif 'grad_norms' in entry:
+                grad_data = entry['grad_norms']
+            else:
+                # Flat entry: keep keys containing 'q_cells' or '0' that do not mention 'stream'
+                grad_data = {k: v for k, v in entry.items() 
+                           if 'stream' not in k and ('q_cells' in k or '0' in k)}
+            
+            for param_name, norm_value in grad_data.items():
+                rows.append({
+                    'num_samples': step,
+                    'parameter': param_name,
+                    'grad_norm': float(norm_value),
+                    'layer_type': self.extract_layer_type(param_name),
+                    'layer_depth': self.extract_layer_depth(param_name)
+                })
+       
+        self.df = pd.DataFrame(rows)
+        print(f"Created DataFrame with {len(self.df)} gradient norm records")
+    
+    def extract_layer_type(self, param_name):
+        """Classify a parameter name into a coarse layer-type label (case-insensitive)."""
+        # Metrics are logged as "grad_norm_<name>", presumably with a DDP "module." prefix;
+        # strip both so the startswith() checks below see the bare parameter name.
+        param_name_lower = re.sub(r'^(grad_norm_)?(module\.)?', '', param_name.lower())
+        if param_name_lower.startswith('embeds.'):
+            if '.embed.' in param_name_lower:
+                return 'embedding'
+            elif '.unembed.' in param_name_lower:
+                return 'unembedding'
+            elif '.ln_final.' in param_name_lower:
+                return 'layer_norm_final'
+            elif 'proj_heads_q' in param_name_lower:
+                return 'attention_q'
+            elif 'proj_heads_k' in param_name_lower:
+                return 'attention_k'
+            elif 'proj_heads_v' in param_name_lower:
+                return 'attention_v'
+            elif 'proj_out' in param_name_lower:
+                return 'attention_out'
+            elif '.layers.' in param_name_lower and ('weight' in param_name_lower or 'bias' in param_name_lower):
+                return 'ffn'
+            else:
+                return 'embeds_other'
+        
+        elif param_name_lower.startswith('ae_local_blocks.'):
+            if 'proj_heads_q' in param_name_lower:
+                return 'ae_local_attention_q'
+            elif 'proj_heads_k' in param_name_lower:
+                return 'ae_local_attention_k'
+            elif 'proj_heads_v' in param_name_lower:
+                return 'ae_local_attention_v'
+            elif 'proj_out' in param_name_lower:
+                return 'ae_local_attention_out'
+            elif '.layers.' in param_name_lower:
+                return 'ae_local_ffn'
+            else:
+                return 'ae_local_other'
+        
+        elif param_name_lower.startswith('ae_global_blocks.'):
+            if 'proj_heads_q' in param_name_lower:
+                return 'ae_global_attention_q'
+            elif 'proj_heads_k' in param_name_lower:
+                return 'ae_global_attention_k'
+            elif 'proj_heads_v' in param_name_lower:
+                return 'ae_global_attention_v'
+            elif 'proj_out' in param_name_lower:
+                return 'ae_global_attention_out'
+            elif '.layers.' in param_name_lower:
+                return 'ae_global_ffn'
+            else:
+                return 'ae_global_other'
+        
+        elif param_name_lower.startswith('ae_adapter.'):
+            if 'proj_heads_q' in param_name_lower:
+                return 'ae_adapter_attention_q'
+            elif 'proj_heads_k' in param_name_lower:
+                return 'ae_adapter_attention_k'
+            elif 'proj_heads_v' in param_name_lower:
+                return 'ae_adapter_attention_v'
+            elif 'proj_out' in param_name_lower:
+                return 'ae_adapter_attention_out'
+            elif '.layers.' in param_name_lower:
+                return 'ae_adapter_ffn'
+            else:
+                return 'ae_adapter_other'
+        
+        elif param_name_lower.startswith('target_token_engines.'):
+            if 'proj_heads_q' in param_name_lower:
+                return 'tte_attention_q'
+            elif 'proj_heads_k' in param_name_lower:
+                return 'tte_attention_k'
+            elif 'proj_heads_v' in param_name_lower:
+                return 'tte_attention_v'
+            elif 'proj_out' in param_name_lower:
+                return 'tte_attention_out'
+            elif 'embed_aux' in param_name_lower:
+                return 'tte_embed_aux'
+            elif 'lnorm' in param_name_lower:
+                return 'tte_layer_norm'
+            elif '.layers.' in param_name_lower:
+                return 'tte_ffn'
+            else:
+                return 'tte_other'
+        
+        elif param_name_lower.startswith('embed_target_coords.'):
+            return 'target_coords_embedding'
+        
+        elif param_name_lower.startswith('pred_heads.'):
+            return 'prediction_head'
+        
+        # Fallback for standard patterns (if any)
+        elif 'embed' in param_name_lower:
+            return 'embedding'
+        elif 'attention' in param_name_lower or 'attn' in param_name_lower:
+            if 'q_proj' in param_name_lower or 'query' in param_name_lower:
+                return 'attention_q'
+            elif 'k_proj' in param_name_lower or 'key' in param_name_lower:
+                return 'attention_k'
+            elif 'v_proj' in param_name_lower or 'value' in param_name_lower:
+                return 'attention_v'
+            elif 'o_proj' in param_name_lower or 'out' in param_name_lower:
+                return 'attention_out'
+            else:
+                return 'attention'
+        elif 'layernorm' in param_name_lower or 'layer_norm' in param_name_lower or 'ln' in param_name_lower:
+            return 'layernorm'
+        else:
+            return 'other'
+    
+    def extract_layer_depth(self, param_name):
+        """Return the layer index found in ``param_name``, or a marker value.
+
+        Nested ``tte.N.block.M`` names give ``N * 10 + M``. Without an index the result
+        is 0 (embed_target_coords), -2 (total norm), -3 (metadata) or -1 (unknown).
+        """
+        # Drop the "grad_norm_" logging prefix and, presumably, a DDP "module." prefix
+        param_name_lower = re.sub(r'^(grad_norm_)?(module\.)?', '', param_name.lower())
+        patterns = [
+            # embeds.0.layers.N.* (transformer layers within embeds)
+            r'embeds\.\d+\.layers\.(\d+)\.',
+            # embeds.0.unembed.N.* (unembedding layers)
+            r'embeds\.\d+\.unembed\.(\d+)\.',
+            # embeds.0.ln_final.N.* (final layer norms)
+            r'embeds\.\d+\.ln_final\.(\d+)\.',
+            # ae_local_blocks.N.* (autoencoder local blocks)
+            r'ae_local_blocks\.(\d+)\.',
+            # ae_global_blocks.N.* (autoencoder global blocks)
+            r'ae_global_blocks\.(\d+)\.',
+            # ae_adapter.N.* (autoencoder adapter blocks)
+            r'ae_adapter\.(\d+)\.',
+            # target_token_engines.0.tte.N.block.M.* (nested; must precede the plain tte.N)
+            r'target_token_engines\.\d+\.tte\.(\d+)\.block\.(\d+)\.',
+            # target_token_engines.0.tte.N.* (target token engine blocks)
+            r'target_token_engines\.\d+\.tte\.(\d+)\.',
+            # pred_heads.0.pred_heads.0.N.* (prediction head layers)
+            r'pred_heads\.\d+\.pred_heads\.\d+\.(\d+)\.',
+            # Generic patterns for any numbered layers
+            r'layer[s]?\.(\d+)',
+            r'h\.(\d+)',
+            r'transformer\.(\d+)',
+            r'blocks\.(\d+)',
+        ]
+        for pattern in patterns:
+            match = re.search(pattern, param_name_lower)
+            if match is None:
+                continue
+            if len(match.groups()) > 1:
+                # Combine the two indices: tte.1.block.2 -> 1 * 10 + 2 = 12
+                return int(match.group(1)) * 10 + int(match.group(2))
+            return int(match.group(1))
+
+        # Special handling for components without clear depth
+        if param_name_lower.startswith('embed_target_coords.'):
+            return 0  # Coordinate embeddings at the start
+        elif 'total_grad_norm' in param_name_lower:
+            return -2  # Special marker for total norm
+        elif any(x in param_name_lower for x in ['weathergen', 'stage', 'q_cells']):
+            return -3  # Special marker for metadata
+        return -1  # Unknown depth
+    
+    def plot_total_gradient_norms(self, figsize=(12, 6)):
+        """Plot the per-entry L2 norm over all logged gradient norms (log y-axis).
+
+        Saves plots/total_grad_norm.png, creating the directory if needed, and
+        returns ``(steps, total_norms)``. Entries without gradient values are skipped.
+        """
+        total_norms = []
+        steps = []
+        for ith, entry in enumerate(self.data):
+            # NOTE(review): assumes one entry per 5 training steps -- confirm log interval
+            step = ith * 5
+            if 'gradients' in entry:
+                grad_data = entry['gradients']
+            elif 'grad_norms' in entry:
+                grad_data = entry['grad_norms']
+            else:
+                # Same key filter as create_dataframe(): 'stream' keys are excluded
+                grad_data = {k: v for k, v in entry.items()
+                             if 'stream' not in k and ('q_cells' in k or '0' in k)}
+            if not grad_data:
+                continue
+            # Calculate total norm (L2 norm of all gradients)
+            total_norm = np.sqrt(sum(float(v)**2 for v in grad_data.values()))
+            total_norms.append(total_norm)
+            steps.append(step)
+
+        plt.figure(figsize=figsize)
+        plt.plot(steps, total_norms, linewidth=1.5, alpha=0.8)
+        plt.xlabel('Training Step')
+        plt.ylabel('Total Gradient Norm')
+        plt.title('Total Gradient Norm vs Training Steps')
+        plt.yscale('log')
+        plt.grid(True, alpha=0.3)
+        plt.tight_layout()
+        Path("plots").mkdir(parents=True, exist_ok=True)  # savefig does not create it
+        plt.savefig("plots/total_grad_norm.png")
+        return steps, total_norms
+    
+    def plot_layer_type_norms(self, figsize=(14, 8)):
+        """Plot, per layer type, the mean gradient norm at each step (log y-axis).
+
+        Saves plots/grad_norm_by_layer_type.png, creating the directory if needed.
+        """
+        if self.df is None or self.df.empty:
+            print("No DataFrame available. Load data first.")
+            return
+
+        plt.figure(figsize=figsize)
+        layer_types = self.df['layer_type'].unique()
+        # One colour per layer type, sampled evenly from the tab10 colormap
+        colors = plt.cm.tab10(np.linspace(0, 1, len(layer_types)))
+
+        for i, layer_type in enumerate(layer_types):
+            layer_data = self.df[self.df['layer_type'] == layer_type]
+            # Calculate mean gradient norm per step for this layer type
+            mean_norms = layer_data.groupby('num_samples')['grad_norm'].mean()
+            plt.plot(mean_norms.index, mean_norms.values,
+                     label=layer_type, color=colors[i], alpha=0.8)
+
+        plt.xlabel('Training Step')
+        plt.ylabel('Mean Gradient Norm')
+        plt.title('Gradient Norms by Layer Type')
+        plt.yscale('log')
+        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
+        plt.grid(True, alpha=0.3)
+        plt.tight_layout()
+        Path("plots").mkdir(parents=True, exist_ok=True)  # savefig does not create it
+        plt.savefig("plots/grad_norm_by_layer_type.png")
+    
+    def plot_layer_depth_analysis(self, figsize=(12, 8)):
+        """Plot mean gradient norm per layer depth as curves and as a heatmap.
+
+        Rows with a negative layer_depth are ignored. Saves
+        plots/grad_norm_heatmap.png, creating the directory if needed.
+        """
+        if self.df is None or self.df.empty:
+            print("No DataFrame available. Load data first.")
+            return
+
+        # Filter out unknown depths
+        depth_data = self.df[self.df['layer_depth'] >= 0]
+        if len(depth_data) == 0:
+            print("No layer depth information found in parameter names.")
+            return
+
+        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize)
+        # Plot 1: Mean gradient norm by depth over time
+        depths = sorted(depth_data['layer_depth'].unique())
+        colors = plt.cm.viridis(np.linspace(0, 1, len(depths)))
+        for i, depth in enumerate(depths):
+            layer_data = depth_data[depth_data['layer_depth'] == depth]
+            mean_norms = layer_data.groupby('num_samples')['grad_norm'].mean()
+            ax1.plot(mean_norms.index, mean_norms.values,
+                     label=f'Layer {depth}', color=colors[i], alpha=0.8)
+
+        ax1.set_xlabel('Training Step')
+        ax1.set_ylabel('Mean Gradient Norm')
+        ax1.set_title('Gradient Norms by Layer Depth')
+        ax1.set_yscale('log')
+        ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
+        ax1.grid(True, alpha=0.3)
+
+        # Plot 2: Heatmap of gradient norms by depth and step
+        pivot_data = depth_data.groupby(['num_samples', 'layer_depth'])['grad_norm'].mean().unstack()
+        # Sample data if too many steps for readability
+        if len(pivot_data) > 100:
+            sample_idx = np.linspace(0, len(pivot_data)-1, 100, dtype=int)
+            pivot_data = pivot_data.iloc[sample_idx]
+
+        # origin='lower': first row (smallest depth) at the bottom, matching extent
+        im = ax2.imshow(pivot_data.T, aspect='auto', cmap='viridis', origin='lower',
+                        extent=[pivot_data.index.min(), pivot_data.index.max(),
+                                pivot_data.columns.min(), pivot_data.columns.max()])
+        ax2.set_xlabel('Training Step')
+        ax2.set_ylabel('Layer Depth')
+        ax2.set_title('Gradient Norm Heatmap (Layer Depth vs Step)')
+        cbar = plt.colorbar(im, ax=ax2)
+        cbar.set_label('Gradient Norm')
+
+        plt.tight_layout()
+        Path("plots").mkdir(parents=True, exist_ok=True)  # savefig does not create it
+        plt.savefig("plots/grad_norm_heatmap.png")
+    
+    def plot_gradient_distribution(self, figsize=(15, 10)):
+        """Plot a 2x2 overview of the gradient norms and save plots/grad_norm_over_time.png.
+        Panels: log10 histogram, box plot per layer type, curves for a sample of parameters,
+        per-step statistics. Zero norms are left out of the two log10 panels.
+        """
+        if self.df is None or self.df.empty:
+            print("No DataFrame available. Load data first.")
+            return
+
+        fig, axes = plt.subplots(2, 2, figsize=figsize)
+        # log10(0) is -inf, which the histogram cannot bin: keep strictly positive norms
+        positive = self.df[self.df['grad_norm'] > 0]
+        # Plot 1: Histogram of all gradient norms
+        axes[0, 0].hist(np.log10(positive['grad_norm'].values), bins=50, alpha=0.7)
+        axes[0, 0].set_xlabel('Log10(Gradient Norm)')
+        axes[0, 0].set_ylabel('Frequency')
+        axes[0, 0].set_title('Distribution of Gradient Norms (Log Scale)')
+        axes[0, 0].grid(True, alpha=0.3)
+
+        # Plot 2: Box plot by layer type
+        layer_types = self.df['layer_type'].unique()[:10]  # Limit to 10 for readability
+        plot_data = [np.log10(positive[positive['layer_type'] == lt]['grad_norm'].values)
+                     for lt in layer_types]
+        axes[0, 1].boxplot(plot_data, labels=layer_types)
+        axes[0, 1].set_xlabel('Layer Type')
+        axes[0, 1].set_ylabel('Log10(Gradient Norm)')
+        axes[0, 1].set_title('Gradient Norm Distribution by Layer Type')
+        axes[0, 1].tick_params(axis='x', rotation=45)
+        axes[0, 1].grid(True, alpha=0.3)
+
+        # Plot 3: Gradient norms over time (sample of parameters)
+        sample_params = self.df['parameter'].unique()[:20]  # Sample 20 parameters
+        for param in sample_params:
+            param_data = self.df[self.df['parameter'] == param]
+            axes[1, 0].plot(param_data['num_samples'], param_data['grad_norm'],
+                            alpha=0.6, linewidth=0.8)
+        axes[1, 0].set_xlabel('Training Step')
+        axes[1, 0].set_ylabel('Gradient Norm')
+        axes[1, 0].set_title('Individual Parameter Gradient Norms (Sample)')
+        axes[1, 0].set_yscale('log')
+        axes[1, 0].grid(True, alpha=0.3)
+
+        # Plot 4: Statistics over time
+        stats_by_step = self.df.groupby('num_samples')['grad_norm'].agg(['mean', 'std', 'min', 'max'])
+        axes[1, 1].fill_between(stats_by_step.index,
+                                stats_by_step['mean'] - stats_by_step['std'],
+                                stats_by_step['mean'] + stats_by_step['std'],
+                                alpha=0.3, label='±1 std')
+        axes[1, 1].plot(stats_by_step.index, stats_by_step['mean'],
+                        label='Mean', linewidth=2)
+        axes[1, 1].plot(stats_by_step.index, stats_by_step['max'],
+                        label='Max', linewidth=1, alpha=0.8)
+        axes[1, 1].plot(stats_by_step.index, stats_by_step['min'],
+                        label='Min', linewidth=1, alpha=0.8)
+        axes[1, 1].set_xlabel('Training Step')
+        axes[1, 1].set_ylabel('Gradient Norm')
+        axes[1, 1].set_title('Gradient Norm Statistics Over Time')
+        axes[1, 1].set_yscale('log')
+        axes[1, 1].legend()
+        axes[1, 1].grid(True, alpha=0.3)
+        plt.tight_layout()
+        Path("plots").mkdir(parents=True, exist_ok=True)  # savefig does not create it
+        plt.savefig("plots/grad_norm_over_time.png")
+    
+    def generate_summary_report(self):
+        """Print overall and per-layer-type gradient norm statistics plus simple warnings."""
+        if self.df is None:
+            print("No DataFrame available. Load data first.")
+            return
+        
+        print("=== GRADIENT NORMS ANALYSIS REPORT ===")
+        print(f"Total data points: {len(self.df)}")
+        print(f"Training steps: {self.df['num_samples'].nunique()}")
+        print(f"Unique parameters: {self.df['parameter'].nunique()}")
+        print()
+        
+        print("Overall Statistics:")
+        print(f"Mean gradient norm: {self.df['grad_norm'].mean():.6f}")
+        print(f"Median gradient norm: {self.df['grad_norm'].median():.6f}")
+        print(f"Min gradient norm: {self.df['grad_norm'].min():.6f}")
+        print(f"Max gradient norm: {self.df['grad_norm'].max():.6f}")
+        print()
+        
+        print("Statistics by Layer Type:")
+        layer_stats = self.df.groupby('layer_type')['grad_norm'].agg(['count', 'mean', 'std', 'min', 'max'])
+        print(layer_stats)
+        print()
+        
+        # Count records outside the fixed thresholds 1e-6 and 10.0 and report them
+        print("Potential Issues:")
+        very_small = (self.df['grad_norm'] < 1e-6).sum()
+        very_large = (self.df['grad_norm'] > 10.0).sum()
+        
+        if very_small > 0:
+            print(f"⚠️  {very_small} gradient norms < 1e-6 (possible vanishing gradients)")
+        if very_large > 0:
+            print(f"⚠️  {very_large} gradient norms > 10.0 (possible exploding gradients)")
+        
+        if very_small == 0 and very_large == 0:
+            print("✅ No obvious gradient issues detected")
+
+# Usage example
+def analyze_gradient_file(json_file_path):
+    """
+    Load a gradient-norm log, print the summary report, write all four plots and
+    return the GradientNormsAnalyzer that was used.
+    Usage:
+    analyze_gradient_file('gradient_norms.jsonl')
+    """
+    
+    analyzer = GradientNormsAnalyzer(json_file_path)
+    
+    # Generate summary report
+    analyzer.generate_summary_report()
+    
+    # Create all plots
+    print("\n=== GENERATING PLOTS ===")
+    
+    print("1. Total gradient norms over time...")
+    analyzer.plot_total_gradient_norms()
+    
+    print("2. Gradient norms by layer type...")
+    analyzer.plot_layer_type_norms()
+    
+    print("3. Layer depth analysis...")
+    analyzer.plot_layer_depth_analysis()
+    
+    print("4. Gradient distribution analysis...")
+    analyzer.plot_gradient_distribution()
+    
+    return analyzer
+
+# Example usage:
+analyzer = analyze_gradient_file('results/yvhxm2jc/yvhxm2jc_train_metrics.json')
diff --git a/uv.lock b/uv.lock
index 51d6a0485..253e7171a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1614,6 +1614,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e6/eb/3bf6ea8ab7f1503dca3a10df2e4b9c3f6b3316df07f6c0ded94b281c7101/scipy-1.15.3-cp312-cp312-win_amd64.whl", hash = "sha256:52092bc0472cfd17df49ff17e70624345efece4e1a12b23783a1ac59a1b728ed", size = 40966184, upload-time = "2025-05-08T16:06:52.623Z" },
 ]
 
+[[package]]
+name = "seaborn"
+version = "0.13.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "matplotlib" },
+    { name = "numpy" },
+    { name = "pandas" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/86/59/a451d7420a77ab0b98f7affa3a1d78a313d2f7281a57afb1a34bae8ab412/seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7", size = 1457696, upload-time = "2024-01-25T13:21:52.551Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987", size = 294914, upload-time = "2024-01-25T13:21:49.598Z" },
+]
+
 [[package]]
 name = "semantic-version"
 version = "2.10.0"
@@ -1897,6 +1911,7 @@ dependencies = [
     { name = "polars" },
     { name = "psutil" },
     { name = "pynvml" },
+    { name = "seaborn" },
     { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'macosx' and sys_platform != 'win32'" },
     { name = "torch", version = "2.6.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'macosx'" },
     { name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
@@ -1928,6 +1943,7 @@ requires-dist = [
     { name = "polars", specifier = "~=1.25.2" },
     { name = "psutil" },
     { name = "pynvml" },
+    { name = "seaborn", specifier = ">=0.13.2" },
     { name = "torch", marker = "sys_platform != 'linux' and sys_platform != 'macosx' and sys_platform != 'win32'", specifier = "==2.6.0" },
     { name = "torch", marker = "sys_platform == 'linux' or sys_platform == 'win32'", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu124" },
     { name = "torch", marker = "sys_platform == 'macosx'", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cpu" },

From d2995b4b6d2b3a7b2ac71c4312eb670f082f1298 Mon Sep 17 00:00:00 2001
From: sophiex <24638638+sophie-xhonneux@users.noreply.github.com>
Date: Thu, 7 Aug 2025 10:17:48 +0000
Subject: [PATCH 03/19] Address review changes + hide behind feature flag

---
 config/default_config.yml               |  1 +
 src/weathergen/train/trainer.py         | 29 +++++++++++++++++++------
 src/weathergen/utils/plot_grad_norms.py |  5 ++++-
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/config/default_config.yml b/config/default_config.yml
index e8f21204a..403b1c20d 100644
--- a/config/default_config.yml
+++ b/config/default_config.yml
@@ -105,6 +105,7 @@ grad_clip: 1.0
 weight_decay: 0.1
 norm_type: "LayerNorm"
 nn_module: "te"
+log_grad_norms: True
 
 start_date: 197901010000
 end_date: 202012310000
diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
index 4430211ac..b8bf07ea4 100644
--- a/src/weathergen/train/trainer.py
+++ b/src/weathergen/train/trainer.py
@@ -13,6 +13,8 @@
 import time
 from typing import Any
 
+from omegaconf import OmegaConf
+
 import numpy as np
 import torch
 import tqdm
@@ -54,6 +56,10 @@ def init(
     ):
         self.cf = cf
 
+        self.cf = OmegaConf.merge(
+            OmegaConf.create({"log_grad_norms": False}), self.cf
+        )
+
         assert cf.samples_per_epoch % cf.batch_size_per_gpu == 0
         assert cf.samples_per_validation % cf.batch_size_validation_per_gpu == 0
         assert cf.forecast_policy if cf.forecast_steps > 0 else True
@@ -76,7 +82,6 @@ def init(
 
         self.init_perf_monitoring()
         self.train_logger = TrainLogger(cf, config.get_path_run(self.cf))
-        self.last_grad_norm = 0.0
 
     def inference(self, cf, run_id_trained, epoch):
         # general initalization
@@ -459,6 +464,7 @@ def train(self, epoch):
 
         # Unweighted loss, real weighted loss, std for losses that need it
         self.loss_unweighted_hist, self.loss_model_hist, self.stdev_unweighted_hist = [], [], []
+        self.last_grad_norm = 0.0
 
         # training loop
         self.t_start = time.time()
@@ -489,12 +495,7 @@ def train(self, epoch):
 
             # log gradient norms
             if bidx % log_interval == 0:
-                grad_norms = { "total_grad_norm" : total_norm.item() }
-                self.last_grad_norm = total_norm.item()
-                for name, param in self.ddp_model.named_parameters():
-                    if param.grad is not None:
-                        grad_norms["grad_norm_" + name] = param.grad.norm().item()
-                self.train_logger.log_metrics(TRAIN, grad_norms)
+                self._log_instant_grad_norms(TRAIN, total_norm)
 
 
             # optimizer step
@@ -709,6 +710,20 @@ def _log(self, stage: Stage):
 
             self.loss_unweighted_hist, self.loss_model_hist, self.stdev_unweighted_hist = [], [], []
 
+    def _log_instant_grad_norms(self, stage: Stage, total_norm):
+        """
+        Log the instantaneous (not averaged, because of the cost and because we want the
+        actual values) total and per-parameter grad norms under `stage`.
+
+        TODO test DDP case
+        """
+        self.last_grad_norm = total_norm.item()  # call .item() once and reuse the value
+        grad_norms = {"total_grad_norm": self.last_grad_norm}
+        for name, param in self.ddp_model.named_parameters():
+            if param.grad is not None:
+                grad_norms["grad_norm_" + name] = param.grad.norm().item()
+        self.train_logger.log_metrics(stage, grad_norms)
+
     def _log_terminal(self, bidx: int, epoch: int, stage: Stage):
         if bidx % self.print_freq == 0 and bidx > 0 or stage == VAL:
             # compute from last iteration
diff --git a/src/weathergen/utils/plot_grad_norms.py b/src/weathergen/utils/plot_grad_norms.py
index 8a6ded4ac..0ff1a1f5c 100644
--- a/src/weathergen/utils/plot_grad_norms.py
+++ b/src/weathergen/utils/plot_grad_norms.py
@@ -480,4 +480,7 @@ def analyze_gradient_file(json_file_path):
     return analyzer
 
 # Example usage:
-analyzer = analyze_gradient_file('results/yvhxm2jc/yvhxm2jc_train_metrics.json')
+# uv run python src/weathergen/utils/plot_grad_norms.py results/yvhxm2jc/yvhxm2jc_train_metrics.json
+if __name__ == '__main__':
+    import sys
+    analyzer = analyze_gradient_file(sys.argv[1])

From 26c6869eccfc595173db9f11e94ad3f62b1ad210 Mon Sep 17 00:00:00 2001
From: sophiex <24638638+sophie-xhonneux@users.noreply.github.com>
Date: Thu, 7 Aug 2025 10:49:05 +0000
Subject: [PATCH 04/19] Final fixes including backward compatibility

---
 config/default_config.yml               |  2 +-
 src/weathergen/train/trainer.py         | 13 +++++--------
 src/weathergen/utils/plot_grad_norms.py | 14 +++++++-------
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/config/default_config.yml b/config/default_config.yml
index 403b1c20d..9fa9d359e 100644
--- a/config/default_config.yml
+++ b/config/default_config.yml
@@ -105,7 +105,7 @@ grad_clip: 1.0
 weight_decay: 0.1
 norm_type: "LayerNorm"
 nn_module: "te"
-log_grad_norms: True
+log_grad_norms: False
 
 start_date: 197901010000
 end_date: 202012310000
diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
index b8bf07ea4..9619c93d2 100644
--- a/src/weathergen/train/trainer.py
+++ b/src/weathergen/train/trainer.py
@@ -56,10 +56,6 @@ def init(
     ):
         self.cf = cf
 
-        self.cf = OmegaConf.merge(
-            OmegaConf.create({"log_grad_norms": False}), self.cf
-        )
-
         assert cf.samples_per_epoch % cf.batch_size_per_gpu == 0
         assert cf.samples_per_validation % cf.batch_size_validation_per_gpu == 0
         assert cf.forecast_policy if cf.forecast_steps > 0 else True
@@ -72,6 +68,8 @@ def init(
         # num_ranks gets overwritten by current setting during init_ddp()
         self.num_ranks_original = cf.get("num_ranks", None)
 
+        self.log_grad_norms = cf.get("log_grad_norms", False)
+
         # TODO remove num_ranks, rank, with_with ddp from config
         self.init_ddp(cf)
 
@@ -494,10 +492,9 @@ def train(self, epoch):
             )
 
             # log gradient norms
-            if bidx % log_interval == 0:
+            if bidx % log_interval == 0 and self.log_grad_norms:
                 self._log_instant_grad_norms(TRAIN, total_norm)
 
-
             # optimizer step
             self.grad_scaler.step(self.optimizer)
             self.grad_scaler.update()
@@ -712,12 +709,12 @@ def _log(self, stage: Stage):
 
     def _log_instant_grad_norms(self, stage: Stage, total_norm):
         """
-        Log instantaneous grad norms, we do not average because of the cost and because we want to 
+        Log instantaneous grad norms, we do not average because of the cost and because we want to
         measure the actual values
 
         TODO test DDP case
         """
-        grad_norms = { "total_grad_norm" : total_norm.item() }
+        grad_norms = {"total_grad_norm": total_norm.item()}
         self.last_grad_norm = total_norm.item()
         for name, param in self.ddp_model.named_parameters():
             if param.grad is not None:
diff --git a/src/weathergen/utils/plot_grad_norms.py b/src/weathergen/utils/plot_grad_norms.py
index 0ff1a1f5c..de50ad8f5 100644
--- a/src/weathergen/utils/plot_grad_norms.py
+++ b/src/weathergen/utils/plot_grad_norms.py
@@ -49,7 +49,7 @@ def create_dataframe(self):
             else:
                 # Assume all keys except step/epoch are gradient data
                 grad_data = {k: v for k, v in entry.items() 
-                           if 'stream' not in k and ('q_cells' in k or '0' in k)}
+                           if 'stream' not in k and ('grad_norm' in k)}
             
             for param_name, norm_value in grad_data.items():
                 rows.append({
@@ -65,7 +65,7 @@ def create_dataframe(self):
     
     def extract_layer_type(self, param_name):
         """Extract layer type from parameter name."""
-        param_name_lower = param_name.lower()
+        param_name_lower = param_name.lower()[10:]
         
         # Handle your specific naming patterns
         if param_name_lower.startswith('embeds.'):
@@ -180,13 +180,13 @@ def extract_layer_depth(self, param_name):
         # Look for patterns specific to your architecture
         patterns = [
             # embeds.0.layers.N.* (transformer layers within embeds)
-            r'embeds\.\d+\.layers\.(\d+)\.',
+            r'grad_norm_embeds\.\d+\.layers\.(\d+)\.',
             # embeds.0.unembed.N.* (unembedding layers)
-            r'embeds\.\d+\.unembed\.(\d+)\.',
+            r'grad_norm_embeds\.\d+\.unembed\.(\d+)\.',
             # embeds.0.ln_final.N.* (final layer norms)
-            r'embeds\.\d+\.ln_final\.(\d+)\.',
+            r'grad_norm_embeds\.\d+\.ln_final\.(\d+)\.',
             # ae_local_blocks.N.* (autoencoder local blocks)
-            r'ae_local_blocks\.(\d+)\.',
+            r'grad_norm_ae_local_blocks\.(\d+)\.',
             # ae_global_blocks.N.* (autoencoder global blocks)
             r'ae_global_blocks\.(\d+)\.',
             # ae_adapter.N.* (autoencoder adapter blocks)
@@ -240,7 +240,7 @@ def plot_total_gradient_norms(self, figsize=(12, 6)):
                 grad_data = entry['grad_norms']
             else:
                 grad_data = {k: v for k, v in entry.items() 
-                             if 'q_cells' in k or '0' in k}
+                             if 'grad_norm' in k}
 
             if len(grad_data) == 0:
                 continue

From 9a66f7217d79a44700fa6d4280ae9b0f2eccc714 Mon Sep 17 00:00:00 2001
From: sophiex <24638638+sophie-xhonneux@users.noreply.github.com>
Date: Thu, 7 Aug 2025 10:51:40 +0000
Subject: [PATCH 05/19] Ruff: remove unused OmegaConf import and stray blank line

---
 src/weathergen/train/trainer.py      | 2 --
 src/weathergen/utils/train_logger.py | 1 -
 2 files changed, 3 deletions(-)

diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
index 41a9aab68..b65987484 100644
--- a/src/weathergen/train/trainer.py
+++ b/src/weathergen/train/trainer.py
@@ -13,8 +13,6 @@
 import time
 from typing import Any
 
-from omegaconf import OmegaConf
-
 import numpy as np
 import torch
 import tqdm
diff --git a/src/weathergen/utils/train_logger.py b/src/weathergen/utils/train_logger.py
index b6840df31..f60e748f7 100644
--- a/src/weathergen/utils/train_logger.py
+++ b/src/weathergen/utils/train_logger.py
@@ -149,7 +149,6 @@ def add_train(
         if perf_mem > 0.0:
             metrics[_performance_memory] = perf_mem
 
-
         self.log_metrics("train", metrics)
         with open(self.path_run / (self.cf.run_id + "_perf_log.txt"), "ab") as f:
             np.savetxt(f, log_vals)

From 22a6fd72d9903dfd9b804d463a93c9d06df782f8 Mon Sep 17 00:00:00 2001
From: sophiex <24638638+sophie-xhonneux@users.noreply.github.com>
Date: Thu, 7 Aug 2025 12:01:38 +0000
Subject: [PATCH 06/19] Ruff: reformat plot_grad_norms.py and clean up its imports

---
 src/weathergen/utils/plot_grad_norms.py | 593 +++++++++++++-----------
 1 file changed, 316 insertions(+), 277 deletions(-)

diff --git a/src/weathergen/utils/plot_grad_norms.py b/src/weathergen/utils/plot_grad_norms.py
index de50ad8f5..ec310c0fc 100644
--- a/src/weathergen/utils/plot_grad_norms.py
+++ b/src/weathergen/utils/plot_grad_norms.py
@@ -1,11 +1,13 @@
 import json
+import re
+from pathlib import Path
+
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-from pathlib import Path
-import seaborn as sns
-from collections import defaultdict
-import re
+
+# ruff: noqa: T201
+
 
 class GradientNormsAnalyzer:
     def __init__(self, json_file_path):
@@ -17,193 +19,202 @@ def __init__(self, json_file_path):
         self.data = []
         self.df = None
         self.load_data()
-        
+
     def load_data(self):
         """Load and parse the JSON data from file."""
         print(f"Loading data from {self.json_file_path}...")
-        
-        with open(self.json_file_path, 'r') as f:
+
+        with open(self.json_file_path) as f:
             for line_num, line in enumerate(f, 1):
                 try:
                     data_point = json.loads(line.strip())
                     self.data.append(data_point)
                 except json.JSONDecodeError as e:
                     print(f"Warning: Could not parse line {line_num}: {e}")
-                    
+
         print(f"Loaded {len(self.data)} data points")
         self.create_dataframe()
-    
+
     def create_dataframe(self):
         """Convert loaded data into a pandas DataFrame for easier analysis."""
         rows = []
-        
+
         for ith, entry in enumerate(self.data):
             # step = entry.get('num_samples', entry.get('epoch', 0))
             step = ith * 5
-            
+
             # Handle different possible data structures
-            if 'gradients' in entry:
-                grad_data = entry['gradients']
-            elif 'grad_norms' in entry:
-                grad_data = entry['grad_norms']
+            if "gradients" in entry:
+                grad_data = entry["gradients"]
+            elif "grad_norms" in entry:
+                grad_data = entry["grad_norms"]
             else:
                 # Assume all keys except step/epoch are gradient data
-                grad_data = {k: v for k, v in entry.items() 
-                           if 'stream' not in k and ('grad_norm' in k)}
-            
+                grad_data = {
+                    k: v for k, v in entry.items() if "stream" not in k and ("grad_norm" in k)
+                }
+
             for param_name, norm_value in grad_data.items():
-                rows.append({
-                    'num_samples': step,
-                    'parameter': param_name,
-                    'grad_norm': float(norm_value),
-                    'layer_type': self.extract_layer_type(param_name),
-                    'layer_depth': self.extract_layer_depth(param_name)
-                })
-       
+                rows.append(
+                    {
+                        "num_samples": step,
+                        "parameter": param_name,
+                        "grad_norm": float(norm_value),
+                        "layer_type": self.extract_layer_type(param_name),
+                        "layer_depth": self.extract_layer_depth(param_name),
+                    }
+                )
+
         self.df = pd.DataFrame(rows)
         print(f"Created DataFrame with {len(self.df)} gradient norm records")
-    
+
     def extract_layer_type(self, param_name):
         """Extract layer type from parameter name."""
         param_name_lower = param_name.lower()[10:]
-        
+
         # Handle your specific naming patterns
-        if param_name_lower.startswith('embeds.'):
-            if '.embed.' in param_name_lower:
-                return 'embedding'
-            elif '.unembed.' in param_name_lower:
-                return 'unembedding'
-            elif '.ln_final.' in param_name_lower:
-                return 'layer_norm_final'
-            elif 'proj_heads_q' in param_name_lower:
-                return 'attention_q'
-            elif 'proj_heads_k' in param_name_lower:
-                return 'attention_k'
-            elif 'proj_heads_v' in param_name_lower:
-                return 'attention_v'
-            elif 'proj_out' in param_name_lower:
-                return 'attention_out'
-            elif '.layers.' in param_name_lower and ('weight' in param_name_lower or 'bias' in param_name_lower):
-                return 'ffn'
+        if param_name_lower.startswith("embeds."):
+            if ".embed." in param_name_lower:
+                return "embedding"
+            elif ".unembed." in param_name_lower:
+                return "unembedding"
+            elif ".ln_final." in param_name_lower:
+                return "layer_norm_final"
+            elif "proj_heads_q" in param_name_lower:
+                return "attention_q"
+            elif "proj_heads_k" in param_name_lower:
+                return "attention_k"
+            elif "proj_heads_v" in param_name_lower:
+                return "attention_v"
+            elif "proj_out" in param_name_lower:
+                return "attention_out"
+            elif ".layers." in param_name_lower and (
+                "weight" in param_name_lower or "bias" in param_name_lower
+            ):
+                return "ffn"
             else:
-                return 'embeds_other'
-        
-        elif param_name_lower.startswith('ae_local_blocks.'):
-            if 'proj_heads_q' in param_name_lower:
-                return 'ae_local_attention_q'
-            elif 'proj_heads_k' in param_name_lower:
-                return 'ae_local_attention_k'
-            elif 'proj_heads_v' in param_name_lower:
-                return 'ae_local_attention_v'
-            elif 'proj_out' in param_name_lower:
-                return 'ae_local_attention_out'
-            elif '.layers.' in param_name_lower:
-                return 'ae_local_ffn'
+                return "embeds_other"
+
+        elif param_name_lower.startswith("ae_local_blocks."):
+            if "proj_heads_q" in param_name_lower:
+                return "ae_local_attention_q"
+            elif "proj_heads_k" in param_name_lower:
+                return "ae_local_attention_k"
+            elif "proj_heads_v" in param_name_lower:
+                return "ae_local_attention_v"
+            elif "proj_out" in param_name_lower:
+                return "ae_local_attention_out"
+            elif ".layers." in param_name_lower:
+                return "ae_local_ffn"
             else:
-                return 'ae_local_other'
-        
-        elif param_name_lower.startswith('ae_global_blocks.'):
-            if 'proj_heads_q' in param_name_lower:
-                return 'ae_global_attention_q'
-            elif 'proj_heads_k' in param_name_lower:
-                return 'ae_global_attention_k'
-            elif 'proj_heads_v' in param_name_lower:
-                return 'ae_global_attention_v'
-            elif 'proj_out' in param_name_lower:
-                return 'ae_global_attention_out'
-            elif '.layers.' in param_name_lower:
-                return 'ae_global_ffn'
+                return "ae_local_other"
+
+        elif param_name_lower.startswith("ae_global_blocks."):
+            if "proj_heads_q" in param_name_lower:
+                return "ae_global_attention_q"
+            elif "proj_heads_k" in param_name_lower:
+                return "ae_global_attention_k"
+            elif "proj_heads_v" in param_name_lower:
+                return "ae_global_attention_v"
+            elif "proj_out" in param_name_lower:
+                return "ae_global_attention_out"
+            elif ".layers." in param_name_lower:
+                return "ae_global_ffn"
             else:
-                return 'ae_global_other'
-        
-        elif param_name_lower.startswith('ae_adapter.'):
-            if 'proj_heads_q' in param_name_lower:
-                return 'ae_adapter_attention_q'
-            elif 'proj_heads_k' in param_name_lower:
-                return 'ae_adapter_attention_k'
-            elif 'proj_heads_v' in param_name_lower:
-                return 'ae_adapter_attention_v'
-            elif 'proj_out' in param_name_lower:
-                return 'ae_adapter_attention_out'
-            elif '.layers.' in param_name_lower:
-                return 'ae_adapter_ffn'
+                return "ae_global_other"
+
+        elif param_name_lower.startswith("ae_adapter."):
+            if "proj_heads_q" in param_name_lower:
+                return "ae_adapter_attention_q"
+            elif "proj_heads_k" in param_name_lower:
+                return "ae_adapter_attention_k"
+            elif "proj_heads_v" in param_name_lower:
+                return "ae_adapter_attention_v"
+            elif "proj_out" in param_name_lower:
+                return "ae_adapter_attention_out"
+            elif ".layers." in param_name_lower:
+                return "ae_adapter_ffn"
             else:
-                return 'ae_adapter_other'
-        
-        elif param_name_lower.startswith('target_token_engines.'):
-            if 'proj_heads_q' in param_name_lower:
-                return 'tte_attention_q'
-            elif 'proj_heads_k' in param_name_lower:
-                return 'tte_attention_k'
-            elif 'proj_heads_v' in param_name_lower:
-                return 'tte_attention_v'
-            elif 'proj_out' in param_name_lower:
-                return 'tte_attention_out'
-            elif 'embed_aux' in param_name_lower:
-                return 'tte_embed_aux'
-            elif 'lnorm' in param_name_lower:
-                return 'tte_layer_norm'
-            elif '.layers.' in param_name_lower:
-                return 'tte_ffn'
+                return "ae_adapter_other"
+
+        elif param_name_lower.startswith("target_token_engines."):
+            if "proj_heads_q" in param_name_lower:
+                return "tte_attention_q"
+            elif "proj_heads_k" in param_name_lower:
+                return "tte_attention_k"
+            elif "proj_heads_v" in param_name_lower:
+                return "tte_attention_v"
+            elif "proj_out" in param_name_lower:
+                return "tte_attention_out"
+            elif "embed_aux" in param_name_lower:
+                return "tte_embed_aux"
+            elif "lnorm" in param_name_lower:
+                return "tte_layer_norm"
+            elif ".layers." in param_name_lower:
+                return "tte_ffn"
             else:
-                return 'tte_other'
-        
-        elif param_name_lower.startswith('embed_target_coords.'):
-            return 'target_coords_embedding'
-        
-        elif param_name_lower.startswith('pred_heads.'):
-            return 'prediction_head'
-        
+                return "tte_other"
+
+        elif param_name_lower.startswith("embed_target_coords."):
+            return "target_coords_embedding"
+
+        elif param_name_lower.startswith("pred_heads."):
+            return "prediction_head"
+
         # Fallback for standard patterns (if any)
-        elif 'embed' in param_name_lower:
-            return 'embedding'
-        elif 'attention' in param_name_lower or 'attn' in param_name_lower:
-            if 'q_proj' in param_name_lower or 'query' in param_name_lower:
-                return 'attention_q'
-            elif 'k_proj' in param_name_lower or 'key' in param_name_lower:
-                return 'attention_k'
-            elif 'v_proj' in param_name_lower or 'value' in param_name_lower:
-                return 'attention_v'
-            elif 'o_proj' in param_name_lower or 'out' in param_name_lower:
-                return 'attention_out'
+        elif "embed" in param_name_lower:
+            return "embedding"
+        elif "attention" in param_name_lower or "attn" in param_name_lower:
+            if "q_proj" in param_name_lower or "query" in param_name_lower:
+                return "attention_q"
+            elif "k_proj" in param_name_lower or "key" in param_name_lower:
+                return "attention_k"
+            elif "v_proj" in param_name_lower or "value" in param_name_lower:
+                return "attention_v"
+            elif "o_proj" in param_name_lower or "out" in param_name_lower:
+                return "attention_out"
             else:
-                return 'attention'
-        elif 'layernorm' in param_name_lower or 'layer_norm' in param_name_lower or 'ln' in param_name_lower:
-            return 'layernorm'
+                return "attention"
+        elif (
+            "layernorm" in param_name_lower
+            or "layer_norm" in param_name_lower
+            or "ln" in param_name_lower
+        ):
+            return "layernorm"
         else:
-            return 'other'
-    
+            return "other"
+
     def extract_layer_depth(self, param_name):
         """Extract layer depth/index from parameter name."""
         param_name_lower = param_name.lower()
-        
+
         # Look for patterns specific to your architecture
         patterns = [
             # embeds.0.layers.N.* (transformer layers within embeds)
-            r'grad_norm_embeds\.\d+\.layers\.(\d+)\.',
+            r"grad_norm_embeds\.\d+\.layers\.(\d+)\.",
             # embeds.0.unembed.N.* (unembedding layers)
-            r'grad_norm_embeds\.\d+\.unembed\.(\d+)\.',
+            r"grad_norm_embeds\.\d+\.unembed\.(\d+)\.",
             # embeds.0.ln_final.N.* (final layer norms)
-            r'grad_norm_embeds\.\d+\.ln_final\.(\d+)\.',
+            r"grad_norm_embeds\.\d+\.ln_final\.(\d+)\.",
             # ae_local_blocks.N.* (autoencoder local blocks)
-            r'grad_norm_ae_local_blocks\.(\d+)\.',
+            r"grad_norm_ae_local_blocks\.(\d+)\.",
             # ae_global_blocks.N.* (autoencoder global blocks)
-            r'ae_global_blocks\.(\d+)\.',
+            r"ae_global_blocks\.(\d+)\.",
             # ae_adapter.N.* (autoencoder adapter blocks)
-            r'ae_adapter\.(\d+)\.',
+            r"ae_adapter\.(\d+)\.",
             # target_token_engines.0.tte.N.* (target token engine blocks)
-            r'target_token_engines\.\d+\.tte\.(\d+)\.',
+            r"target_token_engines\.\d+\.tte\.(\d+)\.",
             # target_token_engines.0.tte.N.block.M.* (nested blocks)
-            r'target_token_engines\.\d+\.tte\.(\d+)\.block\.(\d+)\.',
+            r"target_token_engines\.\d+\.tte\.(\d+)\.block\.(\d+)\.",
             # pred_heads.0.pred_heads.0.N.* (prediction head layers)
-            r'pred_heads\.\d+\.pred_heads\.\d+\.(\d+)\.',
+            r"pred_heads\.\d+\.pred_heads\.\d+\.(\d+)\.",
             # Generic patterns for any numbered layers
-            r'layer[s]?\.(\d+)',
-            r'h\.(\d+)', 
-            r'transformer\.(\d+)',
-            r'blocks\.(\d+)',
+            r"layer[s]?\.(\d+)",
+            r"h\.(\d+)",
+            r"transformer\.(\d+)",
+            r"blocks\.(\d+)",
         ]
-        
+
         for pattern in patterns:
             match = re.search(pattern, param_name_lower)
             if match:
@@ -213,274 +224,302 @@ def extract_layer_depth(self, param_name):
                     return int(match.group(1)) * 10 + int(match.group(2))
                 else:
                     return int(match.group(1))
-        
+
         # Special handling for components without clear depth
-        if param_name_lower.startswith('embed_target_coords.'):
+        if param_name_lower.startswith("embed_target_coords."):
             return 0  # Coordinate embeddings at the start
-        elif 'total_grad_norm' in param_name_lower:
+        elif "total_grad_norm" in param_name_lower:
             return -2  # Special marker for total norm
-        elif any(x in param_name_lower for x in ['weathergen', 'stage', 'q_cells']):
+        elif any(x in param_name_lower for x in ["weathergen", "stage", "q_cells"]):
             return -3  # Special marker for metadata
-        
+
         return -1  # Unknown depth
-    
+
     def plot_total_gradient_norms(self, figsize=(12, 6)):
         """Plot total gradient norm over training steps."""
         # Calculate total norm per step
         total_norms = []
         steps = []
-        
+
         for ith, entry in enumerate(self.data):
             # step = entry.get('num_samples', entry.get('epoch', 0))
             step = ith * 5
-            
-            if 'gradients' in entry:
-                grad_data = entry['gradients']
-            elif 'grad_norms' in entry:
-                grad_data = entry['grad_norms']
+
+            if "gradients" in entry:
+                grad_data = entry["gradients"]
+            elif "grad_norms" in entry:
+                grad_data = entry["grad_norms"]
             else:
-                grad_data = {k: v for k, v in entry.items() 
-                             if 'grad_norm' in k}
+                grad_data = {k: v for k, v in entry.items() if "grad_norm" in k}
 
             if len(grad_data) == 0:
                 continue
-            
+
             # Calculate total norm (L2 norm of all gradients)
-            total_norm = np.sqrt(sum(float(v)**2 for v in grad_data.values()))
+            total_norm = np.sqrt(sum(float(v) ** 2 for v in grad_data.values()))
             total_norms.append(total_norm)
             steps.append(step)
-        
+
         plt.figure(figsize=figsize)
         plt.plot(steps, total_norms, linewidth=1.5, alpha=0.8)
-        plt.xlabel('Training Step')
-        plt.ylabel('Total Gradient Norm')
-        plt.title('Total Gradient Norm vs Training Steps')
-        plt.yscale('log')
+        plt.xlabel("Training Step")
+        plt.ylabel("Total Gradient Norm")
+        plt.title("Total Gradient Norm vs Training Steps")
+        plt.yscale("log")
         plt.grid(True, alpha=0.3)
         plt.tight_layout()
         plt.savefig("plots/total_grad_norm.png")
-        
+
         return steps, total_norms
-    
+
     def plot_layer_type_norms(self, figsize=(14, 8)):
         """Plot gradient norms grouped by layer type."""
         if self.df is None:
             print("No DataFrame available. Load data first.")
             return
-        
+
         plt.figure(figsize=figsize)
-        
+
         # Get unique layer types
-        layer_types = self.df['layer_type'].unique()
+        layer_types = self.df["layer_type"].unique()
         print(layer_types)
         colors = plt.cm.tab10(np.linspace(0, 1, len(layer_types)))
-        
+
         for i, layer_type in enumerate(layer_types):
-            layer_data = self.df[self.df['layer_type'] == layer_type]
-            
+            layer_data = self.df[self.df["layer_type"] == layer_type]
+
             # Calculate mean gradient norm per step for this layer type
-            mean_norms = layer_data.groupby('num_samples')['grad_norm'].mean()
-            
-            plt.plot(mean_norms.index, mean_norms.values, 
-                    label=layer_type, color=colors[i], alpha=0.8)
-        
-        plt.xlabel('Training Step')
-        plt.ylabel('Mean Gradient Norm')
-        plt.title('Gradient Norms by Layer Type')
-        plt.yscale('log')
-        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
+            mean_norms = layer_data.groupby("num_samples")["grad_norm"].mean()
+
+            plt.plot(
+                mean_norms.index, mean_norms.values, label=layer_type, color=colors[i], alpha=0.8
+            )
+
+        plt.xlabel("Training Step")
+        plt.ylabel("Mean Gradient Norm")
+        plt.title("Gradient Norms by Layer Type")
+        plt.yscale("log")
+        plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
         plt.grid(True, alpha=0.3)
         plt.tight_layout()
         plt.savefig("plots/grad_norm_by_layer_type.png")
-    
+
     def plot_layer_depth_analysis(self, figsize=(12, 8)):
         """Plot gradient norms by layer depth."""
         if self.df is None:
             print("No DataFrame available. Load data first.")
             return
-        
+
         # Filter out unknown depths
-        depth_data = self.df[self.df['layer_depth'] >= 0]
-        
+        depth_data = self.df[self.df["layer_depth"] >= 0]
+
         if len(depth_data) == 0:
             print("No layer depth information found in parameter names.")
             return
-        
+
         fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize)
-        
+
         # Plot 1: Mean gradient norm by depth over time
-        depths = sorted(depth_data['layer_depth'].unique())
+        depths = sorted(depth_data["layer_depth"].unique())
         colors = plt.cm.viridis(np.linspace(0, 1, len(depths)))
-        
+
         for i, depth in enumerate(depths):
-            layer_data = depth_data[depth_data['layer_depth'] == depth]
-            mean_norms = layer_data.groupby('num_samples')['grad_norm'].mean()
-            
-            ax1.plot(mean_norms.index, mean_norms.values, 
-                    label=f'Layer {depth}', color=colors[i], alpha=0.8)
-        
-        ax1.set_xlabel('Training Step')
-        ax1.set_ylabel('Mean Gradient Norm')
-        ax1.set_title('Gradient Norms by Layer Depth')
-        ax1.set_yscale('log')
-        ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
+            layer_data = depth_data[depth_data["layer_depth"] == depth]
+            mean_norms = layer_data.groupby("num_samples")["grad_norm"].mean()
+
+            ax1.plot(
+                mean_norms.index,
+                mean_norms.values,
+                label=f"Layer {depth}",
+                color=colors[i],
+                alpha=0.8,
+            )
+
+        ax1.set_xlabel("Training Step")
+        ax1.set_ylabel("Mean Gradient Norm")
+        ax1.set_title("Gradient Norms by Layer Depth")
+        ax1.set_yscale("log")
+        ax1.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
         ax1.grid(True, alpha=0.3)
-        
+
         # Plot 2: Heatmap of gradient norms by depth and step
-        pivot_data = depth_data.groupby(['num_samples', 'layer_depth'])['grad_norm'].mean().unstack()
-        
+        pivot_data = (
+            depth_data.groupby(["num_samples", "layer_depth"])["grad_norm"].mean().unstack()
+        )
+
         # Sample data if too many steps for readability
         if len(pivot_data) > 100:
-            sample_idx = np.linspace(0, len(pivot_data)-1, 100, dtype=int)
+            sample_idx = np.linspace(0, len(pivot_data) - 1, 100, dtype=int)
             pivot_data = pivot_data.iloc[sample_idx]
-        
-        im = ax2.imshow(pivot_data.T, aspect='auto', cmap='viridis', 
-                       extent=[pivot_data.index.min(), pivot_data.index.max(),
-                              pivot_data.columns.min(), pivot_data.columns.max()])
-        ax2.set_xlabel('Training Step')
-        ax2.set_ylabel('Layer Depth')
-        ax2.set_title('Gradient Norm Heatmap (Layer Depth vs Step)')
-        
+
+        im = ax2.imshow(
+            pivot_data.T,
+            aspect="auto",
+            cmap="viridis",
+            extent=[
+                pivot_data.index.min(),
+                pivot_data.index.max(),
+                pivot_data.columns.min(),
+                pivot_data.columns.max(),
+            ],
+        )
+        ax2.set_xlabel("Training Step")
+        ax2.set_ylabel("Layer Depth")
+        ax2.set_title("Gradient Norm Heatmap (Layer Depth vs Step)")
+
         cbar = plt.colorbar(im, ax=ax2)
-        cbar.set_label('Gradient Norm')
-        
+        cbar.set_label("Gradient Norm")
+
         plt.tight_layout()
         plt.savefig("plots/grad_norm_heatmap.png")
-    
+
     def plot_gradient_distribution(self, figsize=(15, 10)):
         """Plot distribution of gradient norms."""
         if self.df is None:
             print("No DataFrame available. Load data first.")
             return
-        
+
         fig, axes = plt.subplots(2, 2, figsize=figsize)
-        
+
         # Plot 1: Histogram of all gradient norms
-        axes[0, 0].hist(np.log10(self.df['grad_norm'].values), bins=50, alpha=0.7)
-        axes[0, 0].set_xlabel('Log10(Gradient Norm)')
-        axes[0, 0].set_ylabel('Frequency')
-        axes[0, 0].set_title('Distribution of Gradient Norms (Log Scale)')
+        axes[0, 0].hist(np.log10(self.df["grad_norm"].values), bins=50, alpha=0.7)
+        axes[0, 0].set_xlabel("Log10(Gradient Norm)")
+        axes[0, 0].set_ylabel("Frequency")
+        axes[0, 0].set_title("Distribution of Gradient Norms (Log Scale)")
         axes[0, 0].grid(True, alpha=0.3)
-        
+
         # Plot 2: Box plot by layer type
-        layer_types = self.df['layer_type'].unique()[:10]  # Limit to 10 for readability
-        plot_data = [np.log10(self.df[self.df['layer_type'] == lt]['grad_norm'].values) 
-                    for lt in layer_types]
-        
+        layer_types = self.df["layer_type"].unique()[:10]  # Limit to 10 for readability
+        plot_data = [
+            np.log10(self.df[self.df["layer_type"] == lt]["grad_norm"].values) for lt in layer_types
+        ]
+
         axes[0, 1].boxplot(plot_data, labels=layer_types)
-        axes[0, 1].set_xlabel('Layer Type')
-        axes[0, 1].set_ylabel('Log10(Gradient Norm)')
-        axes[0, 1].set_title('Gradient Norm Distribution by Layer Type')
-        axes[0, 1].tick_params(axis='x', rotation=45)
+        axes[0, 1].set_xlabel("Layer Type")
+        axes[0, 1].set_ylabel("Log10(Gradient Norm)")
+        axes[0, 1].set_title("Gradient Norm Distribution by Layer Type")
+        axes[0, 1].tick_params(axis="x", rotation=45)
         axes[0, 1].grid(True, alpha=0.3)
-        
+
         # Plot 3: Gradient norms over time (sample of parameters)
-        sample_params = self.df['parameter'].unique()[:20]  # Sample 20 parameters
+        sample_params = self.df["parameter"].unique()[:20]  # Sample 20 parameters
         for param in sample_params:
-            param_data = self.df[self.df['parameter'] == param]
-            axes[1, 0].plot(param_data['num_samples'], param_data['grad_norm'], 
-                          alpha=0.6, linewidth=0.8)
-        
-        axes[1, 0].set_xlabel('Training Step')
-        axes[1, 0].set_ylabel('Gradient Norm')
-        axes[1, 0].set_title('Individual Parameter Gradient Norms (Sample)')
-        axes[1, 0].set_yscale('log')
+            param_data = self.df[self.df["parameter"] == param]
+            axes[1, 0].plot(
+                param_data["num_samples"], param_data["grad_norm"], alpha=0.6, linewidth=0.8
+            )
+
+        axes[1, 0].set_xlabel("Training Step")
+        axes[1, 0].set_ylabel("Gradient Norm")
+        axes[1, 0].set_title("Individual Parameter Gradient Norms (Sample)")
+        axes[1, 0].set_yscale("log")
         axes[1, 0].grid(True, alpha=0.3)
-        
+
         # Plot 4: Statistics over time
-        stats_by_step = self.df.groupby('num_samples')['grad_norm'].agg(['mean', 'std', 'min', 'max'])
-       
-        axes[1, 1].fill_between(stats_by_step.index, 
-                              stats_by_step['mean'] - stats_by_step['std'],
-                              stats_by_step['mean'] + stats_by_step['std'],
-                              alpha=0.3, label='±1 std')
-        axes[1, 1].plot(stats_by_step.index, stats_by_step['mean'], 
-                       label='Mean', linewidth=2)
-        axes[1, 1].plot(stats_by_step.index, stats_by_step['max'], 
-                       label='Max', linewidth=1, alpha=0.8)
-        axes[1, 1].plot(stats_by_step.index, stats_by_step['min'], 
-                       label='Min', linewidth=1, alpha=0.8)
-        
-        axes[1, 1].set_xlabel('Training Step')
-        axes[1, 1].set_ylabel('Gradient Norm')
-        axes[1, 1].set_title('Gradient Norm Statistics Over Time')
-        axes[1, 1].set_yscale('log')
+        stats_by_step = self.df.groupby("num_samples")["grad_norm"].agg(
+            ["mean", "std", "min", "max"]
+        )
+
+        axes[1, 1].fill_between(
+            stats_by_step.index,
+            stats_by_step["mean"] - stats_by_step["std"],
+            stats_by_step["mean"] + stats_by_step["std"],
+            alpha=0.3,
+            label="±1 std",
+        )
+        axes[1, 1].plot(stats_by_step.index, stats_by_step["mean"], label="Mean", linewidth=2)
+        axes[1, 1].plot(
+            stats_by_step.index, stats_by_step["max"], label="Max", linewidth=1, alpha=0.8
+        )
+        axes[1, 1].plot(
+            stats_by_step.index, stats_by_step["min"], label="Min", linewidth=1, alpha=0.8
+        )
+
+        axes[1, 1].set_xlabel("Training Step")
+        axes[1, 1].set_ylabel("Gradient Norm")
+        axes[1, 1].set_title("Gradient Norm Statistics Over Time")
+        axes[1, 1].set_yscale("log")
         axes[1, 1].legend()
         axes[1, 1].grid(True, alpha=0.3)
-        
+
         plt.tight_layout()
         plt.savefig("plots/grad_norm_over_time.png")
-    
+
     def generate_summary_report(self):
         """Generate a summary report of gradient norm statistics."""
         if self.df is None:
             print("No DataFrame available. Load data first.")
             return
-        
+
         print("=== GRADIENT NORMS ANALYSIS REPORT ===")
         print(f"Total data points: {len(self.df)}")
         print(f"Training steps: {self.df['num_samples'].nunique()}")
         print(f"Unique parameters: {self.df['parameter'].nunique()}")
         print()
-        
+
         print("Overall Statistics:")
         print(f"Mean gradient norm: {self.df['grad_norm'].mean():.6f}")
         print(f"Median gradient norm: {self.df['grad_norm'].median():.6f}")
         print(f"Min gradient norm: {self.df['grad_norm'].min():.6f}")
         print(f"Max gradient norm: {self.df['grad_norm'].max():.6f}")
         print()
-        
+
         print("Statistics by Layer Type:")
-        layer_stats = self.df.groupby('layer_type')['grad_norm'].agg(['count', 'mean', 'std', 'min', 'max'])
+        layer_stats = self.df.groupby("layer_type")["grad_norm"].agg(
+            ["count", "mean", "std", "min", "max"]
+        )
         print(layer_stats)
         print()
-        
+
         # Check for potential issues
         print("Potential Issues:")
-        very_small = (self.df['grad_norm'] < 1e-6).sum()
-        very_large = (self.df['grad_norm'] > 10.0).sum()
-        
+        very_small = (self.df["grad_norm"] < 1e-6).sum()
+        very_large = (self.df["grad_norm"] > 10.0).sum()
+
         if very_small > 0:
             print(f"⚠️  {very_small} gradient norms < 1e-6 (possible vanishing gradients)")
         if very_large > 0:
             print(f"⚠️  {very_large} gradient norms > 10.0 (possible exploding gradients)")
-        
+
         if very_small == 0 and very_large == 0:
             print("✅ No obvious gradient issues detected")
 
+
 # Usage example
 def analyze_gradient_file(json_file_path):
     """
     Main function to analyze gradient norms from a JSON file.
-    
+
     Usage:
     analyze_gradient_file('gradient_norms.jsonl')
     """
-    
+
     analyzer = GradientNormsAnalyzer(json_file_path)
-    
+
     # Generate summary report
     analyzer.generate_summary_report()
-    
+
     # Create all plots
     print("\n=== GENERATING PLOTS ===")
-    
+
     print("1. Total gradient norms over time...")
     analyzer.plot_total_gradient_norms()
-    
+
     print("2. Gradient norms by layer type...")
     analyzer.plot_layer_type_norms()
-    
+
     print("3. Layer depth analysis...")
     analyzer.plot_layer_depth_analysis()
-    
+
     print("4. Gradient distribution analysis...")
     analyzer.plot_gradient_distribution()
-    
+
     return analyzer
 
+
 # Example usage:
 # uv run python src/weathergen/utils/plot_grad_norms.py results/yvhxm2jc/yvhxm2jc_train_metrics.json
-if __name__ == '__main__':
+if __name__ == "__main__":
     import sys
+
     analyzer = analyze_gradient_file(sys.argv[1])

From 754d31c660d2fb6f40e285b59f5630971c519d73 Mon Sep 17 00:00:00 2001
From: Julian Kuehnert <julian.b.kuehnert@gmail.com>
Date: Wed, 8 Oct 2025 14:22:12 +0000
Subject: [PATCH 07/19] forecast config with small decoder

---
 config/default_config.yml         | 29 +++++++++++++++--------------
 config/streams/era5_1deg/era5.yml |  2 +-
 2 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/config/default_config.yml b/config/default_config.yml
index 2ecf4f6b8..dde6fafbc 100644
--- a/config/default_config.yml
+++ b/config/default_config.yml
@@ -10,7 +10,7 @@ embed_dropout_rate: 0.1
 target_cell_local_prediction: True
 
 ae_local_dim_embed: 1024
-ae_local_num_blocks: 2
+ae_local_num_blocks: 0
 ae_local_num_heads: 16
 ae_local_dropout_rate: 0.1
 ae_local_with_qk_lnorm: True
@@ -24,7 +24,7 @@ ae_adapter_with_residual: True
 ae_adapter_dropout_rate: 0.1
 
 ae_global_dim_embed: 2048
-ae_global_num_blocks: 8
+ae_global_num_blocks: 4
 ae_global_num_heads: 32
 ae_global_dropout_rate: 0.1
 ae_global_with_qk_lnorm: True
@@ -34,18 +34,19 @@ ae_global_mlp_hidden_factor: 2
 
 decoder_type: PerceiverIOCoordConditioning # CrossAttentionAdaNormConditioning
 pred_adapter_kv: False
-pred_self_attention: True
+pred_self_attention: False
 pred_dyadic_dims: False
 pred_mlp_adaln: True
 
 # number of steps offset applied to first target window; if set to zero and forecast_steps=0 then
 # one is training an auto-encoder
-forecast_offset : 0
+forecast_offset : 1
 forecast_delta_hrs: 0
-forecast_steps: 0
-forecast_policy: null
+forecast_steps: 2
+forecast_policy: "fixed"
+forecast_freeze_model: False
 forecast_att_dense_rate: 1.0
-fe_num_blocks: 0
+fe_num_blocks: 8
 fe_num_heads: 16
 fe_dropout_rate: 0.1
 fe_with_qk_lnorm: True
@@ -85,7 +86,7 @@ freeze_modules: ""
 
 # training mode: "forecast" or "masking" (masked token modeling)
 # for "masking" to train with auto-encoder mode, forecast_offset should be 0
-training_mode: "masking"
+training_mode: "forecast"
 # masking rate when training mode is "masking"; ignored in foreacast mode
 masking_rate: 0.6
 # sample the masking rate (with normal distribution centered at masking_rate)
@@ -93,7 +94,7 @@ masking_rate: 0.6
 masking_rate_sampling: True
 # sample a subset of all target points, useful e.g. to reduce memory requirements (also can specify per-stream)
 sampling_rate_target: 1.0
-# include a masking strategy here, currently only supporting "random", "block", "healpix", "channel", "causal" and "combination"
+# include a masking strategy here, currently only supporting "random", "block", "healpix", "channel" and "combination"
 masking_strategy: "random"
 # masking_strategy_config is a dictionary of additional parameters for the masking strategy
 # required for "healpix" and "channel" masking strategies
@@ -105,17 +106,17 @@ masking_strategy_config: {"strategies": ["random", "healpix", "channel"],
                           "same_strategy_per_batch": false
                           }
 
-num_epochs: 32
+num_epochs: 64
 samples_per_epoch: 4096
 samples_per_validation: 512
 shuffle: True
 
 lr_scaling_policy: "sqrt"
 lr_start: 1e-6
-lr_max: 5e-5
-lr_final_decay: 1e-6
+lr_max: 0.0001
+lr_final_decay: 2e-6
 lr_final: 0.0
-lr_steps_warmup: 512 
+lr_steps_warmup: 256
 lr_steps_cooldown: 512
 lr_policy_warmup: "cosine"
 lr_policy_decay: "linear"
@@ -151,4 +152,4 @@ run_id: ???
 # Parameters for logging/printing in the training loop
 train_log:
   # The period to log metrics (in number of batch steps)
-  log_interval: 20
+  log_interval: 20
\ No newline at end of file
diff --git a/config/streams/era5_1deg/era5.yml b/config/streams/era5_1deg/era5.yml
index a03bb3b40..aaf1bbf53 100644
--- a/config/streams/era5_1deg/era5.yml
+++ b/config/streams/era5_1deg/era5.yml
@@ -29,7 +29,7 @@ ERA5 :
     dim_embed : 256
   target_readout :
     type : 'obs_value'  # token or obs_value
-    num_layers : 2
+    num_layers : 1
     num_heads : 4
     # sampling_rate : 0.2
   pred_head :

From 7c756a3544c91e4f59962f8e1ae6290cf20a45ba Mon Sep 17 00:00:00 2001
From: Julian Kuehnert <julian.b.kuehnert@gmail.com>
Date: Thu, 9 Oct 2025 08:38:14 +0000
Subject: [PATCH 08/19] fixed uv.lock

---
 uv.lock | 292 +++++++++-----------------------------------------------
 1 file changed, 44 insertions(+), 248 deletions(-)

diff --git a/uv.lock b/uv.lock
index 56e875859..79a5b2e2f 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 3
+revision = 2
 requires-python = "==3.12.*"
 resolution-markers = [
     "platform_machine == 'aarch64' and sys_platform == 'linux'",
@@ -874,7 +874,7 @@ name = "jinja2"
 version = "3.1.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "markupsafe", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
+    { name = "markupsafe", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" }
 wheels = [
@@ -1251,52 +1251,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c2/1c/6d343e030815c7c97a1f9fbad00211b47717c7fe446834c224bd5311e6f1/numpy-2.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:bd8df082b6c4695753ad6193018c05aac465d634834dca47a3ae06d4bb22d9ea", size = 9891498, upload-time = "2025-06-07T14:43:36.332Z" },
 ]
 
-[[package]]
-name = "nvidia-cublas-cu12"
-version = "12.4.5.8"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771, upload-time = "2024-06-18T19:28:09.881Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805, upload-time = "2024-04-03T20:57:06.025Z" },
-    { url = "https://files.pythonhosted.org/packages/e2/2a/4f27ca96232e8b5269074a72e03b4e0d43aa68c9b965058b1684d07c6ff8/nvidia_cublas_cu12-12.4.5.8-py3-none-win_amd64.whl", hash = "sha256:5a796786da89203a0657eda402bcdcec6180254a8ac22d72213abc42069522dc", size = 396895858, upload-time = "2024-04-03T21:03:31.996Z" },
-]
-
 [[package]]
 name = "nvidia-cublas-cu12"
 version = "12.6.4.1"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/af/eb/ff4b8c503fa1f1796679dce648854d58751982426e4e4b37d6fce49d259c/nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:08ed2686e9875d01b58e3cb379c6896df8e76c75e0d4a7f7dace3d7b6d9ef8eb", size = 393138322, upload-time = "2024-11-20T17:40:25.65Z" },
     { url = "https://files.pythonhosted.org/packages/97/0d/f1f0cadbf69d5b9ef2e4f744c9466cb0a850741d08350736dfdb4aa89569/nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:235f728d6e2a409eddf1df58d5b0921cf80cfa9e72b9f2775ccb7b4a87984668", size = 390794615, upload-time = "2024-11-20T17:39:52.715Z" },
     { url = "https://files.pythonhosted.org/packages/84/f7/985e9bdbe3e0ac9298fcc8cfa51a392862a46a0ffaccbbd56939b62a9c83/nvidia_cublas_cu12-12.6.4.1-py3-none-win_amd64.whl", hash = "sha256:9e4fa264f4d8a4eb0cdbd34beadc029f453b3bafae02401e999cf3d5a5af75f8", size = 434535301, upload-time = "2024-11-20T17:50:41.681Z" },
 ]
 
-[[package]]
-name = "nvidia-cuda-cupti-cu12"
-version = "12.4.127"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556, upload-time = "2024-06-18T19:30:40.546Z" },
-    { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957, upload-time = "2024-04-03T20:55:01.564Z" },
-    { url = "https://files.pythonhosted.org/packages/f3/79/8cf313ec17c58ccebc965568e5bcb265cdab0a1df99c4e674bb7a3b99bfe/nvidia_cuda_cupti_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:5688d203301ab051449a2b1cb6690fbe90d2b372f411521c86018b950f3d7922", size = 9938035, upload-time = "2024-04-03T21:01:01.109Z" },
-]
-
 [[package]]
 name = "nvidia-cuda-cupti-cu12"
 version = "12.6.80"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/e6/8b/2f6230cb715646c3a9425636e513227ce5c93c4d65823a734f4bb86d43c3/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:166ee35a3ff1587f2490364f90eeeb8da06cd867bd5b701bf7f9a02b78bc63fc", size = 8236764, upload-time = "2024-11-20T17:35:41.03Z" },
     { url = "https://files.pythonhosted.org/packages/25/0f/acb326ac8fd26e13c799e0b4f3b2751543e1834f04d62e729485872198d4/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_aarch64.whl", hash = "sha256:358b4a1d35370353d52e12f0a7d1769fc01ff74a191689d3870b2123156184c4", size = 8236756, upload-time = "2024-10-01T16:57:45.507Z" },
@@ -1305,52 +1273,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1c/81/7796f096afaf726796b1b648f3bc80cafc61fe7f77f44a483c89e6c5ef34/nvidia_cuda_cupti_cu12-12.6.80-py3-none-win_amd64.whl", hash = "sha256:bbe6ae76e83ce5251b56e8c8e61a964f757175682bbad058b170b136266ab00a", size = 5724175, upload-time = "2024-10-01T17:09:47.955Z" },
 ]
 
-[[package]]
-name = "nvidia-cuda-nvrtc-cu12"
-version = "12.4.127"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372, upload-time = "2024-06-18T19:32:00.576Z" },
-    { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306, upload-time = "2024-04-03T20:56:01.463Z" },
-    { url = "https://files.pythonhosted.org/packages/7c/30/8c844bfb770f045bcd8b2c83455c5afb45983e1a8abf0c4e5297b481b6a5/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:a961b2f1d5f17b14867c619ceb99ef6fcec12e46612711bcec78eb05068a60ec", size = 19751955, upload-time = "2024-04-03T21:01:51.133Z" },
-]
-
 [[package]]
 name = "nvidia-cuda-nvrtc-cu12"
 version = "12.6.77"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f4/2f/72df534873235983cc0a5371c3661bebef7c4682760c275590b972c7b0f9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5847f1d6e5b757f1d2b3991a01082a44aad6f10ab3c5c0213fa3e25bddc25a13", size = 23162955, upload-time = "2024-10-01T16:59:50.922Z" },
     { url = "https://files.pythonhosted.org/packages/75/2e/46030320b5a80661e88039f59060d1790298b4718944a65a7f2aeda3d9e9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:35b0cc6ee3a9636d5409133e79273ce1f3fd087abb0532d2d2e8fff1fe9efc53", size = 23650380, upload-time = "2024-10-01T17:00:14.643Z" },
     { url = "https://files.pythonhosted.org/packages/f5/46/d3a1cdda8bb113c80f43a0a6f3a853356d487b830f3483f92d49ce87fa55/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:f7007dbd914c56bd80ea31bc43e8e149da38f68158f423ba845fc3292684e45a", size = 39026742, upload-time = "2024-10-01T17:10:49.058Z" },
 ]
 
-[[package]]
-name = "nvidia-cuda-runtime-cu12"
-version = "12.4.127"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177, upload-time = "2024-06-18T19:32:52.877Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737, upload-time = "2024-04-03T20:54:51.355Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/8b/450e93fab75d85a69b50ea2d5fdd4ff44541e0138db16f9cd90123ef4de4/nvidia_cuda_runtime_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:09c2e35f48359752dfa822c09918211844a3d93c100a715d79b59591130c5e1e", size = 878808, upload-time = "2024-04-03T21:00:49.77Z" },
-]
-
 [[package]]
 name = "nvidia-cuda-runtime-cu12"
 version = "12.6.77"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8f/ea/590b2ac00d772a8abd1c387a92b46486d2679ca6622fd25c18ff76265663/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6116fad3e049e04791c0256a9778c16237837c08b27ed8c8401e2e45de8d60cd", size = 908052, upload-time = "2024-11-20T17:35:19.905Z" },
     { url = "https://files.pythonhosted.org/packages/b7/3d/159023799677126e20c8fd580cca09eeb28d5c5a624adc7f793b9aa8bbfa/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d461264ecb429c84c8879a7153499ddc7b19b5f8d84c204307491989a365588e", size = 908040, upload-time = "2024-10-01T16:57:22.221Z" },
@@ -1359,30 +1295,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fa/76/4c80fa138333cc975743fd0687a745fccb30d167f906f13c1c7f9a85e5ea/nvidia_cuda_runtime_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:86c58044c824bf3c173c49a2dbc7a6c8b53cb4e4dca50068be0bf64e9dab3f7f", size = 891773, upload-time = "2024-10-01T17:09:26.362Z" },
 ]
 
-[[package]]
-name = "nvidia-cudnn-cu12"
-version = "9.1.0.70"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
-dependencies = [
-    { name = "nvidia-cublas-cu12", version = "12.4.5.8", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741, upload-time = "2024-04-22T15:24:15.253Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/d0/f90ee6956a628f9f04bf467932c0a25e5a7e706a684b896593c06c82f460/nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a", size = 679925892, upload-time = "2024-04-22T15:24:53.333Z" },
-]
-
 [[package]]
 name = "nvidia-cudnn-cu12"
 version = "9.5.1.17"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
 dependencies = [
-    { name = "nvidia-cublas-cu12", version = "12.6.4.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/99/93/a201a12d3ec1caa8c6ac34c1c2f9eeb696b886f0c36ff23c638b46603bd0/nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9fd4584468533c61873e5fda8ca41bac3a38bcb2d12350830c69b0a96a7e4def", size = 570523509, upload-time = "2024-10-25T19:53:03.148Z" },
@@ -1390,31 +1308,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b6/b2/3f60d15f037fa5419d9d7f788b100ef33ea913ae5315c87ca6d6fa606c35/nvidia_cudnn_cu12-9.5.1.17-py3-none-win_amd64.whl", hash = "sha256:d7af0f8a4f3b4b9dbb3122f2ef553b45694ed9c384d5a75bab197b8eefb79ab8", size = 565440743, upload-time = "2024-10-25T19:55:49.74Z" },
 ]
 
-[[package]]
-name = "nvidia-cufft-cu12"
-version = "11.2.1.3"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
-dependencies = [
-    { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548, upload-time = "2024-06-18T19:33:39.396Z" },
-    { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117, upload-time = "2024-04-03T20:57:40.402Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/ee/3f3f8e9874f0be5bbba8fb4b62b3de050156d159f8b6edc42d6f1074113b/nvidia_cufft_cu12-11.2.1.3-py3-none-win_amd64.whl", hash = "sha256:d802f4954291101186078ccbe22fc285a902136f974d369540fd4a5333d1440b", size = 210576476, upload-time = "2024-04-03T21:04:06.422Z" },
-]
-
 [[package]]
 name = "nvidia-cufft-cu12"
 version = "11.3.0.4"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/1f/37/c50d2b2f2c07e146776389e3080f4faf70bcc4fa6e19d65bb54ca174ebc3/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d16079550df460376455cba121db6564089176d9bac9e4f360493ca4741b22a6", size = 200164144, upload-time = "2024-11-20T17:40:58.288Z" },
@@ -1424,26 +1323,10 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b4/38/36fd800cec8f6e89b7c1576edaaf8076e69ec631644cdbc1b5f2e2b5a9df/nvidia_cufft_cu12-11.3.0.4-py3-none-win_amd64.whl", hash = "sha256:6048ebddfb90d09d2707efb1fd78d4e3a77cb3ae4dc60e19aab6be0ece2ae464", size = 199356881, upload-time = "2024-10-01T17:13:01.861Z" },
 ]
 
-[[package]]
-name = "nvidia-curand-cu12"
-version = "10.3.5.147"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811, upload-time = "2024-06-18T19:34:48.575Z" },
-    { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206, upload-time = "2024-04-03T20:58:08.722Z" },
-    { url = "https://files.pythonhosted.org/packages/1c/22/2573503d0d4e45673c263a313f79410e110eb562636b0617856fdb2ff5f6/nvidia_curand_cu12-10.3.5.147-py3-none-win_amd64.whl", hash = "sha256:f307cc191f96efe9e8f05a87096abc20d08845a841889ef78cb06924437f6771", size = 55799918, upload-time = "2024-04-03T21:04:34.45Z" },
-]
-
 [[package]]
 name = "nvidia-curand-cu12"
 version = "10.3.7.77"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/42/ac/36543605358a355632f1a6faa3e2d5dfb91eab1e4bc7d552040e0383c335/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:6e82df077060ea28e37f48a3ec442a8f47690c7499bff392a5938614b56c98d8", size = 56289881, upload-time = "2024-10-01T17:04:18.981Z" },
     { url = "https://files.pythonhosted.org/packages/73/1b/44a01c4e70933637c93e6e1a8063d1e998b50213a6b65ac5a9169c47e98e/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a42cd1344297f70b9e39a1e4f467a4e1c10f1da54ff7a85c12197f6c652c8bdf", size = 56279010, upload-time = "2024-11-20T17:42:50.958Z" },
@@ -1452,35 +1335,14 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a9/a8/0cd0cec757bd4b4b4ef150fca62ec064db7d08a291dced835a0be7d2c147/nvidia_curand_cu12-10.3.7.77-py3-none-win_amd64.whl", hash = "sha256:6d6d935ffba0f3d439b7cd968192ff068fafd9018dbf1b85b37261b13cfc9905", size = 55783873, upload-time = "2024-10-01T17:13:30.377Z" },
 ]
 
-[[package]]
-name = "nvidia-cusolver-cu12"
-version = "11.6.1.9"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
-dependencies = [
-    { name = "nvidia-cublas-cu12", version = "12.4.5.8", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparse-cu12", version = "12.3.1.170", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111, upload-time = "2024-06-18T19:35:01.793Z" },
-    { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057, upload-time = "2024-04-03T20:58:28.735Z" },
-    { url = "https://files.pythonhosted.org/packages/f2/be/d435b7b020e854d5d5a682eb5de4328fd62f6182507406f2818280e206e2/nvidia_cusolver_cu12-11.6.1.9-py3-none-win_amd64.whl", hash = "sha256:e77314c9d7b694fcebc84f58989f3aa4fb4cb442f12ca1a9bde50f5e8f6d1b9c", size = 125224015, upload-time = "2024-04-03T21:04:53.339Z" },
-]
-
 [[package]]
 name = "nvidia-cusolver-cu12"
 version = "11.7.1.2"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
 dependencies = [
-    { name = "nvidia-cublas-cu12", version = "12.6.4.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparse-cu12", version = "12.5.4.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/93/17/dbe1aa865e4fdc7b6d4d0dd308fdd5aaab60f939abfc0ea1954eac4fb113/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0ce237ef60acde1efc457335a2ddadfd7610b892d94efee7b776c64bb1cac9e0", size = 157833628, upload-time = "2024-10-01T17:05:05.591Z" },
@@ -1490,31 +1352,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d4/53/fff50a0808df7113d77e3bbc7c2b7eaed6f57d5eb80fbe93ead2aea1e09a/nvidia_cusolver_cu12-11.7.1.2-py3-none-win_amd64.whl", hash = "sha256:6813f9d8073f555444a8705f3ab0296d3e1cb37a16d694c5fc8b862a0d8706d7", size = 149287877, upload-time = "2024-10-01T17:13:49.804Z" },
 ]
 
-[[package]]
-name = "nvidia-cusparse-cu12"
-version = "12.3.1.170"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
-dependencies = [
-    { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987, upload-time = "2024-06-18T19:35:32.989Z" },
-    { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763, upload-time = "2024-04-03T20:58:59.995Z" },
-    { url = "https://files.pythonhosted.org/packages/a2/e0/3155ca539760a8118ec94cc279b34293309bcd14011fc724f87f31988843/nvidia_cusparse_cu12-12.3.1.170-py3-none-win_amd64.whl", hash = "sha256:9bc90fb087bc7b4c15641521f31c0371e9a612fc2ba12c338d3ae032e6b6797f", size = 204684315, upload-time = "2024-04-03T21:05:26.031Z" },
-]
-
 [[package]]
 name = "nvidia-cusparse-cu12"
 version = "12.5.4.2"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/eb/eb/6681efd0aa7df96b4f8067b3ce7246833dd36830bb4cec8896182773db7d/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d25b62fb18751758fe3c93a4a08eff08effedfe4edf1c6bb5afd0890fe88f887", size = 216451147, upload-time = "2024-11-20T17:44:18.055Z" },
@@ -1524,26 +1367,10 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/45/ef/876ad8e4260e1128e6d4aac803d9d51baf3791ebdb4a9b8d9b8db032b4b0/nvidia_cusparse_cu12-12.5.4.2-py3-none-win_amd64.whl", hash = "sha256:4acb8c08855a26d737398cba8fb6f8f5045d93f82612b4cfd84645a2332ccf20", size = 213712630, upload-time = "2024-10-01T17:14:23.779Z" },
 ]
 
-[[package]]
-name = "nvidia-cusparselt-cu12"
-version = "0.6.2"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/98/8e/675498726c605c9441cf46653bd29cb1b8666da1fb1469ffa25f67f20c58/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8", size = 149422781, upload-time = "2024-07-23T17:35:27.203Z" },
-    { url = "https://files.pythonhosted.org/packages/78/a8/bcbb63b53a4b1234feeafb65544ee55495e1bb37ec31b999b963cbccfd1d/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9", size = 150057751, upload-time = "2024-07-23T02:35:53.074Z" },
-    { url = "https://files.pythonhosted.org/packages/56/8f/2c33082238b6c5e783a877dc8786ab62619e3e6171c083bd3bba6e3fe75e/nvidia_cusparselt_cu12-0.6.2-py3-none-win_amd64.whl", hash = "sha256:0057c91d230703924c0422feabe4ce768841f9b4b44d28586b6f6d2eb86fbe70", size = 148755794, upload-time = "2024-07-23T02:35:00.261Z" },
-]
-
 [[package]]
 name = "nvidia-cusparselt-cu12"
 version = "0.6.3"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/62/da/4de092c61c6dea1fc9c936e69308a02531d122e12f1f649825934ad651b5/nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8371549623ba601a06322af2133c4a44350575f5a3108fb75f3ef20b822ad5f1", size = 156402859, upload-time = "2024-10-16T02:23:17.184Z" },
     { url = "https://files.pythonhosted.org/packages/3b/9a/72ef35b399b0e183bc2e8f6f558036922d453c4d8237dab26c666a04244b/nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e5c8a26c36445dd2e6812f1177978a24e2d37cacce7e090f297a688d1ec44f46", size = 156785796, upload-time = "2024-10-15T21:29:17.709Z" },
@@ -1567,52 +1394,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/df/99/12cd266d6233f47d00daf3a72739872bdc10267d0383508b0b9c84a18bb6/nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0", size = 188654414, upload-time = "2024-04-03T15:32:57.427Z" },
 ]
 
-[[package]]
-name = "nvidia-nvjitlink-cu12"
-version = "12.4.127"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510, upload-time = "2024-06-18T20:20:13.871Z" },
-    { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810, upload-time = "2024-04-03T20:59:46.957Z" },
-    { url = "https://files.pythonhosted.org/packages/81/19/0babc919031bee42620257b9a911c528f05fb2688520dcd9ca59159ffea8/nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1", size = 95336325, upload-time = "2024-04-03T21:06:25.073Z" },
-]
-
 [[package]]
 name = "nvidia-nvjitlink-cu12"
 version = "12.6.85"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/9d/d7/c5383e47c7e9bf1c99d5bd2a8c935af2b6d705ad831a7ec5c97db4d82f4f/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:eedc36df9e88b682efe4309aa16b5b4e78c2407eac59e8c10a6a47535164369a", size = 19744971, upload-time = "2024-11-20T17:46:53.366Z" },
     { url = "https://files.pythonhosted.org/packages/31/db/dc71113d441f208cdfe7ae10d4983884e13f464a6252450693365e166dcf/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cf4eaa7d4b6b543ffd69d6abfb11efdeb2db48270d94dfd3a452c24150829e41", size = 19270338, upload-time = "2024-11-20T17:46:29.758Z" },
     { url = "https://files.pythonhosted.org/packages/89/76/93c1467b1387387440a4d25102d86b7794535449b689f8e2dc22c1c8ff7f/nvidia_nvjitlink_cu12-12.6.85-py3-none-win_amd64.whl", hash = "sha256:e61120e52ed675747825cdd16febc6a0730537451d867ee58bee3853b1b13d1c", size = 161908572, upload-time = "2024-11-20T17:52:40.124Z" },
 ]
 
-[[package]]
-name = "nvidia-nvtx-cu12"
-version = "12.4.127"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417, upload-time = "2024-06-18T20:16:22.484Z" },
-    { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144, upload-time = "2024-04-03T20:56:12.406Z" },
-    { url = "https://files.pythonhosted.org/packages/54/1b/f77674fbb73af98843be25803bbd3b9a4f0a96c75b8d33a2854a5c7d2d77/nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485", size = 66307, upload-time = "2024-04-03T21:02:01.959Z" },
-]
-
 [[package]]
 name = "nvidia-nvtx-cu12"
 version = "12.6.77"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "platform_machine == 'x86_64' and sys_platform == 'linux'",
-]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/b9/93/80f8a520375af9d7ee44571a6544653a176e53c2b8ccce85b97b83c2491b/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f44f8d86bb7d5629988d61c8d3ae61dddb2015dee142740536bc7481b022fe4b", size = 90549, upload-time = "2024-11-20T17:38:17.387Z" },
     { url = "https://files.pythonhosted.org/packages/2b/53/36e2fd6c7068997169b49ffc8c12d5af5e5ff209df6e1a2c4d373b3a638f/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:adcaabb9d436c9761fca2b13959a2d237c5f9fd406c8e4b723c695409ff88059", size = 90539, upload-time = "2024-10-01T17:00:27.179Z" },
@@ -2283,6 +2078,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e6/eb/3bf6ea8ab7f1503dca3a10df2e4b9c3f6b3316df07f6c0ded94b281c7101/scipy-1.15.3-cp312-cp312-win_amd64.whl", hash = "sha256:52092bc0472cfd17df49ff17e70624345efece4e1a12b23783a1ac59a1b728ed", size = 40966184, upload-time = "2025-05-08T16:06:52.623Z" },
 ]
 
+[[package]]
+name = "seaborn"
+version = "0.13.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "matplotlib", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
+    { name = "numpy", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
+    { name = "pandas", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/86/59/a451d7420a77ab0b98f7affa3a1d78a313d2f7281a57afb1a34bae8ab412/seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7", size = 1457696, upload-time = "2024-01-25T13:21:52.551Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987", size = 294914, upload-time = "2024-01-25T13:21:49.598Z" },
+]
+
 [[package]]
 name = "semantic-version"
 version = "2.10.0"
@@ -2426,8 +2235,8 @@ wheels = [
 
 [[package]]
 name = "torch"
-version = "2.6.0"
-source = { registry = "https://pypi.org/simple" }
+version = "2.6.0+cpu"
+source = { registry = "https://download.pytorch.org/whl/cpu" }
 resolution-markers = [
     "platform_machine == 'aarch64' and sys_platform == 'linux'",
     "platform_machine == 'x86_64' and sys_platform == 'linux'",
@@ -2437,29 +2246,14 @@ dependencies = [
     { name = "fsspec", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "jinja2", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "networkx", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-    { name = "nvidia-cublas-cu12", version = "12.4.5.8", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-cupti-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-nvrtc-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-runtime-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cudnn-cu12", version = "9.1.0.70", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cufft-cu12", version = "11.2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-curand-cu12", version = "10.3.5.147", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusolver-cu12", version = "11.6.1.9", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparse-cu12", version = "12.3.1.170", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparselt-cu12", version = "0.6.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvtx-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "setuptools", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
-    { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "typing-extensions", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/e5/35/0c52d708144c2deb595cd22819a609f78fdd699b95ff6f0ebcd456e3c7c1/torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9", size = 766624563, upload-time = "2025-01-29T16:23:19.084Z" },
-    { url = "https://files.pythonhosted.org/packages/01/d6/455ab3fbb2c61c71c8842753b566012e1ed111e7a4c82e0e1c20d0c76b62/torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb", size = 95607867, upload-time = "2025-01-29T16:25:55.649Z" },
-    { url = "https://files.pythonhosted.org/packages/18/cf/ae99bd066571656185be0d88ee70abc58467b76f2f7c8bfeb48735a71fe6/torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239", size = 204120469, upload-time = "2025-01-29T16:24:01.821Z" },
-    { url = "https://files.pythonhosted.org/packages/81/b4/605ae4173aa37fb5aa14605d100ff31f4f5d49f617928c9f486bb3aaec08/torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989", size = 66532538, upload-time = "2025-01-29T16:24:18.976Z" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-linux_x86_64.whl", hash = "sha256:59e78aa0c690f70734e42670036d6b541930b8eabbaa18d94e090abf14cc4d91" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:318290e8924353c61b125cdc8768d15208704e279e7757c113b9620740deca98" },
+    { url = "https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:4027d982eb2781c93825ab9527f17fbbb12dbabf422298e4b954be60016f87d8" },
 ]
 
 [[package]]
@@ -2508,19 +2302,19 @@ dependencies = [
     { name = "fsspec", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "jinja2", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "networkx", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cublas-cu12", version = "12.6.4.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-cupti-cu12", version = "12.6.80", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-nvrtc-cu12", version = "12.6.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-runtime-cu12", version = "12.6.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cudnn-cu12", version = "9.5.1.17", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cufft-cu12", version = "11.3.0.4", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-curand-cu12", version = "10.3.7.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusolver-cu12", version = "11.7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparse-cu12", version = "12.5.4.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparselt-cu12", version = "0.6.3", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvtx-cu12", version = "12.6.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "setuptools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "sympy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
@@ -2687,6 +2481,7 @@ dependencies = [
     { name = "polars", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
     { name = "psutil", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
     { name = "pynvml", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
+    { name = "seaborn", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
     { name = "tqdm", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
     { name = "weathergen-common", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
     { name = "weathergen-evaluate", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
@@ -2696,7 +2491,7 @@ dependencies = [
 
 [package.optional-dependencies]
 cpu = [
-    { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "torch", version = "2.6.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
 ]
 gpu = [
     { name = "flash-attn", version = "2.7.3", source = { url = "https://object-store.os-api.cci1.ecmwf.int/weathergenerator-dev/wheels/flash_attn-2.7.3-cp312-cp312-linux_aarch64.whl" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'extra-10-weathergen-gpu') or (platform_machine != 'aarch64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
@@ -2735,11 +2530,12 @@ requires-dist = [
     { name = "polars", specifier = "~=1.25.2" },
     { name = "psutil" },
     { name = "pynvml" },
+    { name = "seaborn", specifier = ">=0.13.2" },
     { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'gpu'", url = "https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-linux_aarch64.whl" },
     { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'gpu'", url = "https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-manylinux_2_28_x86_64.whl" },
+    { name = "torch", marker = "sys_platform == 'linux' and extra == 'cpu'", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "weathergen", extra = "cpu" } },
+    { name = "torch", marker = "sys_platform != 'linux' and extra == 'cpu'", specifier = "==2.6.0" },
     { name = "torch", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'gpu') or (sys_platform != 'linux' and extra == 'gpu')", specifier = "==2.6.0+cu126" },
-    { name = "torch", marker = "sys_platform == 'macosx' and extra == 'cpu'", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "weathergen", extra = "cpu" } },
-    { name = "torch", marker = "sys_platform != 'macosx' and extra == 'cpu'", specifier = "==2.6.0" },
     { name = "tqdm" },
     { name = "weathergen-common", editable = "packages/common" },
     { name = "weathergen-evaluate", editable = "packages/evaluate" },

From 41716a670c0fbddbe96a3433210ff9d3cd717236 Mon Sep 17 00:00:00 2001
From: Julian Kuehnert <julian.b.kuehnert@gmail.com>
Date: Thu, 9 Oct 2025 16:02:02 +0000
Subject: [PATCH 09/19] test gradient logging on multi gpus

---
 config/default_config.yml       |  2 +-
 src/weathergen/train/trainer.py | 21 ++++++++++++++++-----
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/config/default_config.yml b/config/default_config.yml
index b14fddcba..d67d5359e 100644
--- a/config/default_config.yml
+++ b/config/default_config.yml
@@ -128,7 +128,7 @@ grad_clip: 1.0
 weight_decay: 0.1
 norm_type: "LayerNorm"
 nn_module: "te"
-log_grad_norms: False
+log_grad_norms: True
 
 start_date: 197901010000
 end_date: 202012310000
diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
index 17f7e4433..83515d317 100644
--- a/src/weathergen/train/trainer.py
+++ b/src/weathergen/train/trainer.py
@@ -930,12 +930,23 @@ def _log_instant_grad_norms(self, stage: Stage, total_norm):
 
         TODO test DDP case
         """
-        grad_norms = {"total_grad_norm": total_norm.item()}
-        self.last_grad_norm = total_norm.item()
-        for name, param in self.ddp_model.named_parameters():
+        self.last_grad_norm = (
+            total_norm.full_tensor().item() if self.cf.world_size > 1 else total_norm.item()
+        )
+        grad_norms = {"total_grad_norm": self.last_grad_norm}
+        for name, param in self.model.named_parameters():
             if param.grad is not None:
-                grad_norms["grad_norm_" + name] = param.grad.norm().item()
-        self.train_logger.log_metrics(TRAIN, grad_norms)
+                # grad_norms["grad_norm_" + name] = param.grad.norm().item()
+                grad_norms["grad_norm_" + name] = (
+                    param.grad.norm().full_tensor().item()
+                    if self.cf.world_size > 1
+                    else param.grad.norm().item()
+                )
+
+        # print(".item():", param.grad.norm().item())
+        # print(".full_tensor().item()", param.grad.norm().full_tensor().item())
+        if is_root():
+            self.train_logger.log_metrics(TRAIN, grad_norms)
 
     def _log_terminal(self, bidx: int, epoch: int, stage: Stage):
         if bidx % self.print_freq == 0 and bidx > 0 or stage == VAL:

From 8bdbac41ec37c4ed18bb9e2fed54af90c20ca60f Mon Sep 17 00:00:00 2001
From: Julian Kuehnert <julian.b.kuehnert@gmail.com>
Date: Mon, 13 Oct 2025 13:24:44 +0000
Subject: [PATCH 10/19] update uv.lock to latest develop version

---
 uv.lock | 292 +++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 248 insertions(+), 44 deletions(-)

diff --git a/uv.lock b/uv.lock
index 79a5b2e2f..56e875859 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = "==3.12.*"
 resolution-markers = [
     "platform_machine == 'aarch64' and sys_platform == 'linux'",
@@ -874,7 +874,7 @@ name = "jinja2"
 version = "3.1.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "markupsafe", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "markupsafe", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" }
 wheels = [
@@ -1251,20 +1251,52 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c2/1c/6d343e030815c7c97a1f9fbad00211b47717c7fe446834c224bd5311e6f1/numpy-2.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:bd8df082b6c4695753ad6193018c05aac465d634834dca47a3ae06d4bb22d9ea", size = 9891498, upload-time = "2025-06-07T14:43:36.332Z" },
 ]
 
+[[package]]
+name = "nvidia-cublas-cu12"
+version = "12.4.5.8"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771, upload-time = "2024-06-18T19:28:09.881Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805, upload-time = "2024-04-03T20:57:06.025Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/2a/4f27ca96232e8b5269074a72e03b4e0d43aa68c9b965058b1684d07c6ff8/nvidia_cublas_cu12-12.4.5.8-py3-none-win_amd64.whl", hash = "sha256:5a796786da89203a0657eda402bcdcec6180254a8ac22d72213abc42069522dc", size = 396895858, upload-time = "2024-04-03T21:03:31.996Z" },
+]
+
 [[package]]
 name = "nvidia-cublas-cu12"
 version = "12.6.4.1"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/af/eb/ff4b8c503fa1f1796679dce648854d58751982426e4e4b37d6fce49d259c/nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:08ed2686e9875d01b58e3cb379c6896df8e76c75e0d4a7f7dace3d7b6d9ef8eb", size = 393138322, upload-time = "2024-11-20T17:40:25.65Z" },
     { url = "https://files.pythonhosted.org/packages/97/0d/f1f0cadbf69d5b9ef2e4f744c9466cb0a850741d08350736dfdb4aa89569/nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:235f728d6e2a409eddf1df58d5b0921cf80cfa9e72b9f2775ccb7b4a87984668", size = 390794615, upload-time = "2024-11-20T17:39:52.715Z" },
     { url = "https://files.pythonhosted.org/packages/84/f7/985e9bdbe3e0ac9298fcc8cfa51a392862a46a0ffaccbbd56939b62a9c83/nvidia_cublas_cu12-12.6.4.1-py3-none-win_amd64.whl", hash = "sha256:9e4fa264f4d8a4eb0cdbd34beadc029f453b3bafae02401e999cf3d5a5af75f8", size = 434535301, upload-time = "2024-11-20T17:50:41.681Z" },
 ]
 
+[[package]]
+name = "nvidia-cuda-cupti-cu12"
+version = "12.4.127"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556, upload-time = "2024-06-18T19:30:40.546Z" },
+    { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957, upload-time = "2024-04-03T20:55:01.564Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/79/8cf313ec17c58ccebc965568e5bcb265cdab0a1df99c4e674bb7a3b99bfe/nvidia_cuda_cupti_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:5688d203301ab051449a2b1cb6690fbe90d2b372f411521c86018b950f3d7922", size = 9938035, upload-time = "2024-04-03T21:01:01.109Z" },
+]
+
 [[package]]
 name = "nvidia-cuda-cupti-cu12"
 version = "12.6.80"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/e6/8b/2f6230cb715646c3a9425636e513227ce5c93c4d65823a734f4bb86d43c3/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:166ee35a3ff1587f2490364f90eeeb8da06cd867bd5b701bf7f9a02b78bc63fc", size = 8236764, upload-time = "2024-11-20T17:35:41.03Z" },
     { url = "https://files.pythonhosted.org/packages/25/0f/acb326ac8fd26e13c799e0b4f3b2751543e1834f04d62e729485872198d4/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_aarch64.whl", hash = "sha256:358b4a1d35370353d52e12f0a7d1769fc01ff74a191689d3870b2123156184c4", size = 8236756, upload-time = "2024-10-01T16:57:45.507Z" },
@@ -1273,20 +1305,52 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1c/81/7796f096afaf726796b1b648f3bc80cafc61fe7f77f44a483c89e6c5ef34/nvidia_cuda_cupti_cu12-12.6.80-py3-none-win_amd64.whl", hash = "sha256:bbe6ae76e83ce5251b56e8c8e61a964f757175682bbad058b170b136266ab00a", size = 5724175, upload-time = "2024-10-01T17:09:47.955Z" },
 ]
 
+[[package]]
+name = "nvidia-cuda-nvrtc-cu12"
+version = "12.4.127"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372, upload-time = "2024-06-18T19:32:00.576Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306, upload-time = "2024-04-03T20:56:01.463Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/30/8c844bfb770f045bcd8b2c83455c5afb45983e1a8abf0c4e5297b481b6a5/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:a961b2f1d5f17b14867c619ceb99ef6fcec12e46612711bcec78eb05068a60ec", size = 19751955, upload-time = "2024-04-03T21:01:51.133Z" },
+]
+
 [[package]]
 name = "nvidia-cuda-nvrtc-cu12"
 version = "12.6.77"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f4/2f/72df534873235983cc0a5371c3661bebef7c4682760c275590b972c7b0f9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5847f1d6e5b757f1d2b3991a01082a44aad6f10ab3c5c0213fa3e25bddc25a13", size = 23162955, upload-time = "2024-10-01T16:59:50.922Z" },
     { url = "https://files.pythonhosted.org/packages/75/2e/46030320b5a80661e88039f59060d1790298b4718944a65a7f2aeda3d9e9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:35b0cc6ee3a9636d5409133e79273ce1f3fd087abb0532d2d2e8fff1fe9efc53", size = 23650380, upload-time = "2024-10-01T17:00:14.643Z" },
     { url = "https://files.pythonhosted.org/packages/f5/46/d3a1cdda8bb113c80f43a0a6f3a853356d487b830f3483f92d49ce87fa55/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:f7007dbd914c56bd80ea31bc43e8e149da38f68158f423ba845fc3292684e45a", size = 39026742, upload-time = "2024-10-01T17:10:49.058Z" },
 ]
 
+[[package]]
+name = "nvidia-cuda-runtime-cu12"
+version = "12.4.127"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177, upload-time = "2024-06-18T19:32:52.877Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737, upload-time = "2024-04-03T20:54:51.355Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/8b/450e93fab75d85a69b50ea2d5fdd4ff44541e0138db16f9cd90123ef4de4/nvidia_cuda_runtime_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:09c2e35f48359752dfa822c09918211844a3d93c100a715d79b59591130c5e1e", size = 878808, upload-time = "2024-04-03T21:00:49.77Z" },
+]
+
 [[package]]
 name = "nvidia-cuda-runtime-cu12"
 version = "12.6.77"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8f/ea/590b2ac00d772a8abd1c387a92b46486d2679ca6622fd25c18ff76265663/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6116fad3e049e04791c0256a9778c16237837c08b27ed8c8401e2e45de8d60cd", size = 908052, upload-time = "2024-11-20T17:35:19.905Z" },
     { url = "https://files.pythonhosted.org/packages/b7/3d/159023799677126e20c8fd580cca09eeb28d5c5a624adc7f793b9aa8bbfa/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d461264ecb429c84c8879a7153499ddc7b19b5f8d84c204307491989a365588e", size = 908040, upload-time = "2024-10-01T16:57:22.221Z" },
@@ -1295,12 +1359,30 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fa/76/4c80fa138333cc975743fd0687a745fccb30d167f906f13c1c7f9a85e5ea/nvidia_cuda_runtime_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:86c58044c824bf3c173c49a2dbc7a6c8b53cb4e4dca50068be0bf64e9dab3f7f", size = 891773, upload-time = "2024-10-01T17:09:26.362Z" },
 ]
 
+[[package]]
+name = "nvidia-cudnn-cu12"
+version = "9.1.0.70"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
+dependencies = [
+    { name = "nvidia-cublas-cu12", version = "12.4.5.8", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741, upload-time = "2024-04-22T15:24:15.253Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/d0/f90ee6956a628f9f04bf467932c0a25e5a7e706a684b896593c06c82f460/nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a", size = 679925892, upload-time = "2024-04-22T15:24:53.333Z" },
+]
+
 [[package]]
 name = "nvidia-cudnn-cu12"
 version = "9.5.1.17"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cublas-cu12", version = "12.6.4.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/99/93/a201a12d3ec1caa8c6ac34c1c2f9eeb696b886f0c36ff23c638b46603bd0/nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9fd4584468533c61873e5fda8ca41bac3a38bcb2d12350830c69b0a96a7e4def", size = 570523509, upload-time = "2024-10-25T19:53:03.148Z" },
@@ -1308,12 +1390,31 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b6/b2/3f60d15f037fa5419d9d7f788b100ef33ea913ae5315c87ca6d6fa606c35/nvidia_cudnn_cu12-9.5.1.17-py3-none-win_amd64.whl", hash = "sha256:d7af0f8a4f3b4b9dbb3122f2ef553b45694ed9c384d5a75bab197b8eefb79ab8", size = 565440743, upload-time = "2024-10-25T19:55:49.74Z" },
 ]
 
+[[package]]
+name = "nvidia-cufft-cu12"
+version = "11.2.1.3"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
+dependencies = [
+    { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548, upload-time = "2024-06-18T19:33:39.396Z" },
+    { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117, upload-time = "2024-04-03T20:57:40.402Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/ee/3f3f8e9874f0be5bbba8fb4b62b3de050156d159f8b6edc42d6f1074113b/nvidia_cufft_cu12-11.2.1.3-py3-none-win_amd64.whl", hash = "sha256:d802f4954291101186078ccbe22fc285a902136f974d369540fd4a5333d1440b", size = 210576476, upload-time = "2024-04-03T21:04:06.422Z" },
+]
+
 [[package]]
 name = "nvidia-cufft-cu12"
 version = "11.3.0.4"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/1f/37/c50d2b2f2c07e146776389e3080f4faf70bcc4fa6e19d65bb54ca174ebc3/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d16079550df460376455cba121db6564089176d9bac9e4f360493ca4741b22a6", size = 200164144, upload-time = "2024-11-20T17:40:58.288Z" },
@@ -1323,10 +1424,26 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b4/38/36fd800cec8f6e89b7c1576edaaf8076e69ec631644cdbc1b5f2e2b5a9df/nvidia_cufft_cu12-11.3.0.4-py3-none-win_amd64.whl", hash = "sha256:6048ebddfb90d09d2707efb1fd78d4e3a77cb3ae4dc60e19aab6be0ece2ae464", size = 199356881, upload-time = "2024-10-01T17:13:01.861Z" },
 ]
 
+[[package]]
+name = "nvidia-curand-cu12"
+version = "10.3.5.147"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811, upload-time = "2024-06-18T19:34:48.575Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206, upload-time = "2024-04-03T20:58:08.722Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/22/2573503d0d4e45673c263a313f79410e110eb562636b0617856fdb2ff5f6/nvidia_curand_cu12-10.3.5.147-py3-none-win_amd64.whl", hash = "sha256:f307cc191f96efe9e8f05a87096abc20d08845a841889ef78cb06924437f6771", size = 55799918, upload-time = "2024-04-03T21:04:34.45Z" },
+]
+
 [[package]]
 name = "nvidia-curand-cu12"
 version = "10.3.7.77"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/42/ac/36543605358a355632f1a6faa3e2d5dfb91eab1e4bc7d552040e0383c335/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:6e82df077060ea28e37f48a3ec442a8f47690c7499bff392a5938614b56c98d8", size = 56289881, upload-time = "2024-10-01T17:04:18.981Z" },
     { url = "https://files.pythonhosted.org/packages/73/1b/44a01c4e70933637c93e6e1a8063d1e998b50213a6b65ac5a9169c47e98e/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a42cd1344297f70b9e39a1e4f467a4e1c10f1da54ff7a85c12197f6c652c8bdf", size = 56279010, upload-time = "2024-11-20T17:42:50.958Z" },
@@ -1335,14 +1452,35 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a9/a8/0cd0cec757bd4b4b4ef150fca62ec064db7d08a291dced835a0be7d2c147/nvidia_curand_cu12-10.3.7.77-py3-none-win_amd64.whl", hash = "sha256:6d6d935ffba0f3d439b7cd968192ff068fafd9018dbf1b85b37261b13cfc9905", size = 55783873, upload-time = "2024-10-01T17:13:30.377Z" },
 ]
 
+[[package]]
+name = "nvidia-cusolver-cu12"
+version = "11.6.1.9"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
+dependencies = [
+    { name = "nvidia-cublas-cu12", version = "12.4.5.8", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", version = "12.3.1.170", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111, upload-time = "2024-06-18T19:35:01.793Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057, upload-time = "2024-04-03T20:58:28.735Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/be/d435b7b020e854d5d5a682eb5de4328fd62f6182507406f2818280e206e2/nvidia_cusolver_cu12-11.6.1.9-py3-none-win_amd64.whl", hash = "sha256:e77314c9d7b694fcebc84f58989f3aa4fb4cb442f12ca1a9bde50f5e8f6d1b9c", size = 125224015, upload-time = "2024-04-03T21:04:53.339Z" },
+]
+
 [[package]]
 name = "nvidia-cusolver-cu12"
 version = "11.7.1.2"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cublas-cu12", version = "12.6.4.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", version = "12.5.4.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/93/17/dbe1aa865e4fdc7b6d4d0dd308fdd5aaab60f939abfc0ea1954eac4fb113/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0ce237ef60acde1efc457335a2ddadfd7610b892d94efee7b776c64bb1cac9e0", size = 157833628, upload-time = "2024-10-01T17:05:05.591Z" },
@@ -1352,12 +1490,31 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d4/53/fff50a0808df7113d77e3bbc7c2b7eaed6f57d5eb80fbe93ead2aea1e09a/nvidia_cusolver_cu12-11.7.1.2-py3-none-win_amd64.whl", hash = "sha256:6813f9d8073f555444a8705f3ab0296d3e1cb37a16d694c5fc8b862a0d8706d7", size = 149287877, upload-time = "2024-10-01T17:13:49.804Z" },
 ]
 
+[[package]]
+name = "nvidia-cusparse-cu12"
+version = "12.3.1.170"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
+dependencies = [
+    { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987, upload-time = "2024-06-18T19:35:32.989Z" },
+    { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763, upload-time = "2024-04-03T20:58:59.995Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/e0/3155ca539760a8118ec94cc279b34293309bcd14011fc724f87f31988843/nvidia_cusparse_cu12-12.3.1.170-py3-none-win_amd64.whl", hash = "sha256:9bc90fb087bc7b4c15641521f31c0371e9a612fc2ba12c338d3ae032e6b6797f", size = 204684315, upload-time = "2024-04-03T21:05:26.031Z" },
+]
+
 [[package]]
 name = "nvidia-cusparse-cu12"
 version = "12.5.4.2"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/eb/eb/6681efd0aa7df96b4f8067b3ce7246833dd36830bb4cec8896182773db7d/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d25b62fb18751758fe3c93a4a08eff08effedfe4edf1c6bb5afd0890fe88f887", size = 216451147, upload-time = "2024-11-20T17:44:18.055Z" },
@@ -1367,10 +1524,26 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/45/ef/876ad8e4260e1128e6d4aac803d9d51baf3791ebdb4a9b8d9b8db032b4b0/nvidia_cusparse_cu12-12.5.4.2-py3-none-win_amd64.whl", hash = "sha256:4acb8c08855a26d737398cba8fb6f8f5045d93f82612b4cfd84645a2332ccf20", size = 213712630, upload-time = "2024-10-01T17:14:23.779Z" },
 ]
 
+[[package]]
+name = "nvidia-cusparselt-cu12"
+version = "0.6.2"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/98/8e/675498726c605c9441cf46653bd29cb1b8666da1fb1469ffa25f67f20c58/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8", size = 149422781, upload-time = "2024-07-23T17:35:27.203Z" },
+    { url = "https://files.pythonhosted.org/packages/78/a8/bcbb63b53a4b1234feeafb65544ee55495e1bb37ec31b999b963cbccfd1d/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9", size = 150057751, upload-time = "2024-07-23T02:35:53.074Z" },
+    { url = "https://files.pythonhosted.org/packages/56/8f/2c33082238b6c5e783a877dc8786ab62619e3e6171c083bd3bba6e3fe75e/nvidia_cusparselt_cu12-0.6.2-py3-none-win_amd64.whl", hash = "sha256:0057c91d230703924c0422feabe4ce768841f9b4b44d28586b6f6d2eb86fbe70", size = 148755794, upload-time = "2024-07-23T02:35:00.261Z" },
+]
+
 [[package]]
 name = "nvidia-cusparselt-cu12"
 version = "0.6.3"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/62/da/4de092c61c6dea1fc9c936e69308a02531d122e12f1f649825934ad651b5/nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8371549623ba601a06322af2133c4a44350575f5a3108fb75f3ef20b822ad5f1", size = 156402859, upload-time = "2024-10-16T02:23:17.184Z" },
     { url = "https://files.pythonhosted.org/packages/3b/9a/72ef35b399b0e183bc2e8f6f558036922d453c4d8237dab26c666a04244b/nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e5c8a26c36445dd2e6812f1177978a24e2d37cacce7e090f297a688d1ec44f46", size = 156785796, upload-time = "2024-10-15T21:29:17.709Z" },
@@ -1394,20 +1567,52 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/df/99/12cd266d6233f47d00daf3a72739872bdc10267d0383508b0b9c84a18bb6/nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0", size = 188654414, upload-time = "2024-04-03T15:32:57.427Z" },
 ]
 
+[[package]]
+name = "nvidia-nvjitlink-cu12"
+version = "12.4.127"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510, upload-time = "2024-06-18T20:20:13.871Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810, upload-time = "2024-04-03T20:59:46.957Z" },
+    { url = "https://files.pythonhosted.org/packages/81/19/0babc919031bee42620257b9a911c528f05fb2688520dcd9ca59159ffea8/nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1", size = 95336325, upload-time = "2024-04-03T21:06:25.073Z" },
+]
+
 [[package]]
 name = "nvidia-nvjitlink-cu12"
 version = "12.6.85"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/9d/d7/c5383e47c7e9bf1c99d5bd2a8c935af2b6d705ad831a7ec5c97db4d82f4f/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:eedc36df9e88b682efe4309aa16b5b4e78c2407eac59e8c10a6a47535164369a", size = 19744971, upload-time = "2024-11-20T17:46:53.366Z" },
     { url = "https://files.pythonhosted.org/packages/31/db/dc71113d441f208cdfe7ae10d4983884e13f464a6252450693365e166dcf/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cf4eaa7d4b6b543ffd69d6abfb11efdeb2db48270d94dfd3a452c24150829e41", size = 19270338, upload-time = "2024-11-20T17:46:29.758Z" },
     { url = "https://files.pythonhosted.org/packages/89/76/93c1467b1387387440a4d25102d86b7794535449b689f8e2dc22c1c8ff7f/nvidia_nvjitlink_cu12-12.6.85-py3-none-win_amd64.whl", hash = "sha256:e61120e52ed675747825cdd16febc6a0730537451d867ee58bee3853b1b13d1c", size = 161908572, upload-time = "2024-11-20T17:52:40.124Z" },
 ]
 
+[[package]]
+name = "nvidia-nvtx-cu12"
+version = "12.4.127"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417, upload-time = "2024-06-18T20:16:22.484Z" },
+    { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144, upload-time = "2024-04-03T20:56:12.406Z" },
+    { url = "https://files.pythonhosted.org/packages/54/1b/f77674fbb73af98843be25803bbd3b9a4f0a96c75b8d33a2854a5c7d2d77/nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485", size = 66307, upload-time = "2024-04-03T21:02:01.959Z" },
+]
+
 [[package]]
 name = "nvidia-nvtx-cu12"
 version = "12.6.77"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "platform_machine == 'x86_64' and sys_platform == 'linux'",
+]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/b9/93/80f8a520375af9d7ee44571a6544653a176e53c2b8ccce85b97b83c2491b/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f44f8d86bb7d5629988d61c8d3ae61dddb2015dee142740536bc7481b022fe4b", size = 90549, upload-time = "2024-11-20T17:38:17.387Z" },
     { url = "https://files.pythonhosted.org/packages/2b/53/36e2fd6c7068997169b49ffc8c12d5af5e5ff209df6e1a2c4d373b3a638f/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:adcaabb9d436c9761fca2b13959a2d237c5f9fd406c8e4b723c695409ff88059", size = 90539, upload-time = "2024-10-01T17:00:27.179Z" },
@@ -2078,20 +2283,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e6/eb/3bf6ea8ab7f1503dca3a10df2e4b9c3f6b3316df07f6c0ded94b281c7101/scipy-1.15.3-cp312-cp312-win_amd64.whl", hash = "sha256:52092bc0472cfd17df49ff17e70624345efece4e1a12b23783a1ac59a1b728ed", size = 40966184, upload-time = "2025-05-08T16:06:52.623Z" },
 ]
 
-[[package]]
-name = "seaborn"
-version = "0.13.2"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "matplotlib", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
-    { name = "numpy", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
-    { name = "pandas", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/86/59/a451d7420a77ab0b98f7affa3a1d78a313d2f7281a57afb1a34bae8ab412/seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7", size = 1457696, upload-time = "2024-01-25T13:21:52.551Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987", size = 294914, upload-time = "2024-01-25T13:21:49.598Z" },
-]
-
 [[package]]
 name = "semantic-version"
 version = "2.10.0"
@@ -2235,8 +2426,8 @@ wheels = [
 
 [[package]]
 name = "torch"
-version = "2.6.0+cpu"
-source = { registry = "https://download.pytorch.org/whl/cpu" }
+version = "2.6.0"
+source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
     "platform_machine == 'aarch64' and sys_platform == 'linux'",
     "platform_machine == 'x86_64' and sys_platform == 'linux'",
@@ -2246,14 +2437,29 @@ dependencies = [
     { name = "fsspec", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "jinja2", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "networkx", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "nvidia-cublas-cu12", version = "12.4.5.8", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-cupti-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-nvrtc-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-runtime-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cudnn-cu12", version = "9.1.0.70", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cufft-cu12", version = "11.2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-curand-cu12", version = "10.3.5.147", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusolver-cu12", version = "11.6.1.9", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", version = "12.3.1.170", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparselt-cu12", version = "0.6.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvtx-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "setuptools", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
     { name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "typing-extensions", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
 ]
 wheels = [
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-linux_x86_64.whl", hash = "sha256:59e78aa0c690f70734e42670036d6b541930b8eabbaa18d94e090abf14cc4d91" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:318290e8924353c61b125cdc8768d15208704e279e7757c113b9620740deca98" },
-    { url = "https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:4027d982eb2781c93825ab9527f17fbbb12dbabf422298e4b954be60016f87d8" },
+    { url = "https://files.pythonhosted.org/packages/e5/35/0c52d708144c2deb595cd22819a609f78fdd699b95ff6f0ebcd456e3c7c1/torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9", size = 766624563, upload-time = "2025-01-29T16:23:19.084Z" },
+    { url = "https://files.pythonhosted.org/packages/01/d6/455ab3fbb2c61c71c8842753b566012e1ed111e7a4c82e0e1c20d0c76b62/torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb", size = 95607867, upload-time = "2025-01-29T16:25:55.649Z" },
+    { url = "https://files.pythonhosted.org/packages/18/cf/ae99bd066571656185be0d88ee70abc58467b76f2f7c8bfeb48735a71fe6/torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239", size = 204120469, upload-time = "2025-01-29T16:24:01.821Z" },
+    { url = "https://files.pythonhosted.org/packages/81/b4/605ae4173aa37fb5aa14605d100ff31f4f5d49f617928c9f486bb3aaec08/torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989", size = 66532538, upload-time = "2025-01-29T16:24:18.976Z" },
 ]
 
 [[package]]
@@ -2302,19 +2508,19 @@ dependencies = [
     { name = "fsspec", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "jinja2", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "networkx", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cublas-cu12", version = "12.6.4.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-cupti-cu12", version = "12.6.80", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-nvrtc-cu12", version = "12.6.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cuda-runtime-cu12", version = "12.6.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cudnn-cu12", version = "9.5.1.17", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cufft-cu12", version = "11.3.0.4", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-curand-cu12", version = "10.3.7.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusolver-cu12", version = "11.7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparse-cu12", version = "12.5.4.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-cusparselt-cu12", version = "0.6.3", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
-    { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
+    { name = "nvidia-nvtx-cu12", version = "12.6.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "setuptools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "sympy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
     { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" },
@@ -2481,7 +2687,6 @@ dependencies = [
     { name = "polars", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
     { name = "psutil", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
     { name = "pynvml", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
-    { name = "seaborn", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
     { name = "tqdm", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
     { name = "weathergen-common", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
     { name = "weathergen-evaluate", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
@@ -2491,7 +2696,7 @@ dependencies = [
 
 [package.optional-dependencies]
 cpu = [
-    { name = "torch", version = "2.6.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
+    { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" },
 ]
 gpu = [
     { name = "flash-attn", version = "2.7.3", source = { url = "https://object-store.os-api.cci1.ecmwf.int/weathergenerator-dev/wheels/flash_attn-2.7.3-cp312-cp312-linux_aarch64.whl" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'extra-10-weathergen-gpu') or (platform_machine != 'aarch64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" },
@@ -2530,12 +2735,11 @@ requires-dist = [
     { name = "polars", specifier = "~=1.25.2" },
     { name = "psutil" },
     { name = "pynvml" },
-    { name = "seaborn", specifier = ">=0.13.2" },
     { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'gpu'", url = "https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-linux_aarch64.whl" },
     { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'gpu'", url = "https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-manylinux_2_28_x86_64.whl" },
-    { name = "torch", marker = "sys_platform == 'linux' and extra == 'cpu'", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "weathergen", extra = "cpu" } },
-    { name = "torch", marker = "sys_platform != 'linux' and extra == 'cpu'", specifier = "==2.6.0" },
     { name = "torch", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'gpu') or (sys_platform != 'linux' and extra == 'gpu')", specifier = "==2.6.0+cu126" },
+    { name = "torch", marker = "sys_platform == 'macosx' and extra == 'cpu'", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "weathergen", extra = "cpu" } },
+    { name = "torch", marker = "sys_platform != 'macosx' and extra == 'cpu'", specifier = "==2.6.0" },
     { name = "tqdm" },
     { name = "weathergen-common", editable = "packages/common" },
     { name = "weathergen-evaluate", editable = "packages/evaluate" },

From da92f8fd67fabaee8de479893b61d4a43044b5f3 Mon Sep 17 00:00:00 2001
From: Julian Kuehnert <julian.b.kuehnert@gmail.com>
Date: Mon, 13 Oct 2025 13:26:31 +0000
Subject: [PATCH 11/19] revert to default config

---
 config/default_config.yml         | 30 ++++++++++++++----------------
 config/streams/era5_1deg/era5.yml |  2 +-
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/config/default_config.yml b/config/default_config.yml
index d67d5359e..5ab614cf7 100644
--- a/config/default_config.yml
+++ b/config/default_config.yml
@@ -10,7 +10,7 @@ embed_dropout_rate: 0.1
 target_cell_local_prediction: True
 
 ae_local_dim_embed: 1024
-ae_local_num_blocks: 0
+ae_local_num_blocks: 2
 ae_local_num_heads: 16
 ae_local_dropout_rate: 0.1
 ae_local_with_qk_lnorm: True
@@ -24,7 +24,7 @@ ae_adapter_with_residual: True
 ae_adapter_dropout_rate: 0.1
 
 ae_global_dim_embed: 2048
-ae_global_num_blocks: 4
+ae_global_num_blocks: 8
 ae_global_num_heads: 32
 ae_global_dropout_rate: 0.1
 ae_global_with_qk_lnorm: True
@@ -36,19 +36,18 @@ ae_global_mlp_hidden_factor: 2
 
 decoder_type: PerceiverIOCoordConditioning # CrossAttentionAdaNormConditioning
 pred_adapter_kv: False
-pred_self_attention: False
+pred_self_attention: True
 pred_dyadic_dims: False
 pred_mlp_adaln: True
 
 # number of steps offset applied to first target window; if set to zero and forecast_steps=0 then
 # one is training an auto-encoder
-forecast_offset : 1
+forecast_offset : 0
 forecast_delta_hrs: 0
-forecast_steps: 2
-forecast_policy: "fixed"
-forecast_freeze_model: False
+forecast_steps: 0
+forecast_policy: null
 forecast_att_dense_rate: 1.0
-fe_num_blocks: 8
+fe_num_blocks: 0
 fe_num_heads: 16
 fe_dropout_rate: 0.1
 fe_with_qk_lnorm: True
@@ -88,7 +87,7 @@ freeze_modules: ""
 
 # training mode: "forecast" or "masking" (masked token modeling)
 # for "masking" to train with auto-encoder mode, forecast_offset should be 0
-training_mode: "forecast"
+training_mode: "masking"
 # masking rate when training mode is "masking"; ignored in foreacast mode
 masking_rate: 0.6
 # sample the masking rate (with normal distribution centered at masking_rate)
@@ -96,7 +95,7 @@ masking_rate: 0.6
 masking_rate_sampling: True
 # sample a subset of all target points, useful e.g. to reduce memory requirements (also can specify per-stream)
 sampling_rate_target: 1.0
-# include a masking strategy here, currently only supporting "random", "block", "healpix", "channel", "combination"
+# include a masking strategy here, currently only supporting "random", "block", "healpix", "channel", "causal" and "combination"
 masking_strategy: "random"
 # masking_strategy_config is a dictionary of additional parameters for the masking strategy
 # required for "healpix" and "channel" masking strategies
@@ -108,17 +107,17 @@ masking_strategy_config: {"strategies": ["random", "healpix", "channel"],
                           "same_strategy_per_batch": false
                           }
 
-num_epochs: 64
+num_epochs: 32
 samples_per_epoch: 4096
 samples_per_validation: 512
 shuffle: True
 
 lr_scaling_policy: "sqrt"
 lr_start: 1e-6
-lr_max: 0.0001
-lr_final_decay: 2e-6
+lr_max: 5e-5
+lr_final_decay: 1e-6
 lr_final: 0.0
-lr_steps_warmup: 256
+lr_steps_warmup: 512 
 lr_steps_cooldown: 512
 lr_policy_warmup: "cosine"
 lr_policy_decay: "linear"
@@ -128,7 +127,6 @@ grad_clip: 1.0
 weight_decay: 0.1
 norm_type: "LayerNorm"
 nn_module: "te"
-log_grad_norms: True
 
 start_date: 197901010000
 end_date: 202012310000
@@ -154,4 +152,4 @@ run_id: ???
 # Parameters for logging/printing in the training loop
 train_log:
   # The period to log metrics (in number of batch steps)
-  log_interval: 20
\ No newline at end of file
+  log_interval: 20
diff --git a/config/streams/era5_1deg/era5.yml b/config/streams/era5_1deg/era5.yml
index 85ac8a8ca..bb2234c4e 100644
--- a/config/streams/era5_1deg/era5.yml
+++ b/config/streams/era5_1deg/era5.yml
@@ -29,7 +29,7 @@ ERA5 :
     dim_embed : 256
   target_readout :
     type : 'obs_value'  # token or obs_value
-    num_layers : 1
+    num_layers : 2
     num_heads : 4
     # sampling_rate : 0.2
   pred_head :

From a072c35973ccc445f852445355caf54d92c7dc47 Mon Sep 17 00:00:00 2001
From: Julian Kuehnert <julian.b.kuehnert@gmail.com>
Date: Mon, 13 Oct 2025 13:54:43 +0000
Subject: [PATCH 12/19] add comment on FSDP2 specifics

---
 src/weathergen/train/trainer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
index 83515d317..cb29bf68e 100644
--- a/src/weathergen/train/trainer.py
+++ b/src/weathergen/train/trainer.py
@@ -926,9 +926,10 @@ def _log(self, stage: Stage):
     def _log_instant_grad_norms(self, stage: Stage, total_norm):
         """
         Log instantaneous grad norms, we do not average because of the cost and because we want to
-        measure the actual values
+        measure the actual values.
 
-        TODO test DDP case
+        Note: When using FSDP2, we need full_tensor().item() instead of .item(), see here:
+        https://gist.github.com/Kai-46/a9835ef3f36e76d06afee6c11f388144
         """
         self.last_grad_norm = (
             total_norm.full_tensor().item() if self.cf.world_size > 1 else total_norm.item()

From c8fadf6a13b41c5fd7b65842af445fc6d2fc38be Mon Sep 17 00:00:00 2001
From: Jubeku <julian.kuehnert@ecmwf.int>
Date: Thu, 16 Oct 2025 14:59:48 +0200
Subject: [PATCH 13/19] move plot grad script to private repo

---
 src/weathergen/utils/plot_grad_norms.py | 525 ------------------------
 1 file changed, 525 deletions(-)
 delete mode 100644 src/weathergen/utils/plot_grad_norms.py

diff --git a/src/weathergen/utils/plot_grad_norms.py b/src/weathergen/utils/plot_grad_norms.py
deleted file mode 100644
index ec310c0fc..000000000
--- a/src/weathergen/utils/plot_grad_norms.py
+++ /dev/null
@@ -1,525 +0,0 @@
-import json
-import re
-from pathlib import Path
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-
-# ruff: noqa: T201
-
-
-class GradientNormsAnalyzer:
-    def __init__(self, json_file_path):
-        """
-        Initialize the analyzer with path to JSON file containing gradient norms.
-        Expected format: one JSON object per line with step info and gradient norms.
-        """
-        self.json_file_path = Path(json_file_path)
-        self.data = []
-        self.df = None
-        self.load_data()
-
-    def load_data(self):
-        """Load and parse the JSON data from file."""
-        print(f"Loading data from {self.json_file_path}...")
-
-        with open(self.json_file_path) as f:
-            for line_num, line in enumerate(f, 1):
-                try:
-                    data_point = json.loads(line.strip())
-                    self.data.append(data_point)
-                except json.JSONDecodeError as e:
-                    print(f"Warning: Could not parse line {line_num}: {e}")
-
-        print(f"Loaded {len(self.data)} data points")
-        self.create_dataframe()
-
-    def create_dataframe(self):
-        """Convert loaded data into a pandas DataFrame for easier analysis."""
-        rows = []
-
-        for ith, entry in enumerate(self.data):
-            # step = entry.get('num_samples', entry.get('epoch', 0))
-            step = ith * 5
-
-            # Handle different possible data structures
-            if "gradients" in entry:
-                grad_data = entry["gradients"]
-            elif "grad_norms" in entry:
-                grad_data = entry["grad_norms"]
-            else:
-                # Assume all keys except step/epoch are gradient data
-                grad_data = {
-                    k: v for k, v in entry.items() if "stream" not in k and ("grad_norm" in k)
-                }
-
-            for param_name, norm_value in grad_data.items():
-                rows.append(
-                    {
-                        "num_samples": step,
-                        "parameter": param_name,
-                        "grad_norm": float(norm_value),
-                        "layer_type": self.extract_layer_type(param_name),
-                        "layer_depth": self.extract_layer_depth(param_name),
-                    }
-                )
-
-        self.df = pd.DataFrame(rows)
-        print(f"Created DataFrame with {len(self.df)} gradient norm records")
-
-    def extract_layer_type(self, param_name):
-        """Extract layer type from parameter name."""
-        param_name_lower = param_name.lower()[10:]
-
-        # Handle your specific naming patterns
-        if param_name_lower.startswith("embeds."):
-            if ".embed." in param_name_lower:
-                return "embedding"
-            elif ".unembed." in param_name_lower:
-                return "unembedding"
-            elif ".ln_final." in param_name_lower:
-                return "layer_norm_final"
-            elif "proj_heads_q" in param_name_lower:
-                return "attention_q"
-            elif "proj_heads_k" in param_name_lower:
-                return "attention_k"
-            elif "proj_heads_v" in param_name_lower:
-                return "attention_v"
-            elif "proj_out" in param_name_lower:
-                return "attention_out"
-            elif ".layers." in param_name_lower and (
-                "weight" in param_name_lower or "bias" in param_name_lower
-            ):
-                return "ffn"
-            else:
-                return "embeds_other"
-
-        elif param_name_lower.startswith("ae_local_blocks."):
-            if "proj_heads_q" in param_name_lower:
-                return "ae_local_attention_q"
-            elif "proj_heads_k" in param_name_lower:
-                return "ae_local_attention_k"
-            elif "proj_heads_v" in param_name_lower:
-                return "ae_local_attention_v"
-            elif "proj_out" in param_name_lower:
-                return "ae_local_attention_out"
-            elif ".layers." in param_name_lower:
-                return "ae_local_ffn"
-            else:
-                return "ae_local_other"
-
-        elif param_name_lower.startswith("ae_global_blocks."):
-            if "proj_heads_q" in param_name_lower:
-                return "ae_global_attention_q"
-            elif "proj_heads_k" in param_name_lower:
-                return "ae_global_attention_k"
-            elif "proj_heads_v" in param_name_lower:
-                return "ae_global_attention_v"
-            elif "proj_out" in param_name_lower:
-                return "ae_global_attention_out"
-            elif ".layers." in param_name_lower:
-                return "ae_global_ffn"
-            else:
-                return "ae_global_other"
-
-        elif param_name_lower.startswith("ae_adapter."):
-            if "proj_heads_q" in param_name_lower:
-                return "ae_adapter_attention_q"
-            elif "proj_heads_k" in param_name_lower:
-                return "ae_adapter_attention_k"
-            elif "proj_heads_v" in param_name_lower:
-                return "ae_adapter_attention_v"
-            elif "proj_out" in param_name_lower:
-                return "ae_adapter_attention_out"
-            elif ".layers." in param_name_lower:
-                return "ae_adapter_ffn"
-            else:
-                return "ae_adapter_other"
-
-        elif param_name_lower.startswith("target_token_engines."):
-            if "proj_heads_q" in param_name_lower:
-                return "tte_attention_q"
-            elif "proj_heads_k" in param_name_lower:
-                return "tte_attention_k"
-            elif "proj_heads_v" in param_name_lower:
-                return "tte_attention_v"
-            elif "proj_out" in param_name_lower:
-                return "tte_attention_out"
-            elif "embed_aux" in param_name_lower:
-                return "tte_embed_aux"
-            elif "lnorm" in param_name_lower:
-                return "tte_layer_norm"
-            elif ".layers." in param_name_lower:
-                return "tte_ffn"
-            else:
-                return "tte_other"
-
-        elif param_name_lower.startswith("embed_target_coords."):
-            return "target_coords_embedding"
-
-        elif param_name_lower.startswith("pred_heads."):
-            return "prediction_head"
-
-        # Fallback for standard patterns (if any)
-        elif "embed" in param_name_lower:
-            return "embedding"
-        elif "attention" in param_name_lower or "attn" in param_name_lower:
-            if "q_proj" in param_name_lower or "query" in param_name_lower:
-                return "attention_q"
-            elif "k_proj" in param_name_lower or "key" in param_name_lower:
-                return "attention_k"
-            elif "v_proj" in param_name_lower or "value" in param_name_lower:
-                return "attention_v"
-            elif "o_proj" in param_name_lower or "out" in param_name_lower:
-                return "attention_out"
-            else:
-                return "attention"
-        elif (
-            "layernorm" in param_name_lower
-            or "layer_norm" in param_name_lower
-            or "ln" in param_name_lower
-        ):
-            return "layernorm"
-        else:
-            return "other"
-
-    def extract_layer_depth(self, param_name):
-        """Extract layer depth/index from parameter name."""
-        param_name_lower = param_name.lower()
-
-        # Look for patterns specific to your architecture
-        patterns = [
-            # embeds.0.layers.N.* (transformer layers within embeds)
-            r"grad_norm_embeds\.\d+\.layers\.(\d+)\.",
-            # embeds.0.unembed.N.* (unembedding layers)
-            r"grad_norm_embeds\.\d+\.unembed\.(\d+)\.",
-            # embeds.0.ln_final.N.* (final layer norms)
-            r"grad_norm_embeds\.\d+\.ln_final\.(\d+)\.",
-            # ae_local_blocks.N.* (autoencoder local blocks)
-            r"grad_norm_ae_local_blocks\.(\d+)\.",
-            # ae_global_blocks.N.* (autoencoder global blocks)
-            r"ae_global_blocks\.(\d+)\.",
-            # ae_adapter.N.* (autoencoder adapter blocks)
-            r"ae_adapter\.(\d+)\.",
-            # target_token_engines.0.tte.N.* (target token engine blocks)
-            r"target_token_engines\.\d+\.tte\.(\d+)\.",
-            # target_token_engines.0.tte.N.block.M.* (nested blocks)
-            r"target_token_engines\.\d+\.tte\.(\d+)\.block\.(\d+)\.",
-            # pred_heads.0.pred_heads.0.N.* (prediction head layers)
-            r"pred_heads\.\d+\.pred_heads\.\d+\.(\d+)\.",
-            # Generic patterns for any numbered layers
-            r"layer[s]?\.(\d+)",
-            r"h\.(\d+)",
-            r"transformer\.(\d+)",
-            r"blocks\.(\d+)",
-        ]
-
-        for pattern in patterns:
-            match = re.search(pattern, param_name_lower)
-            if match:
-                # For nested patterns (like tte blocks), combine indices
-                if len(match.groups()) > 1:
-                    # Combine indices: e.g., tte.1.block.2 -> 12 (or 1*10+2)
-                    return int(match.group(1)) * 10 + int(match.group(2))
-                else:
-                    return int(match.group(1))
-
-        # Special handling for components without clear depth
-        if param_name_lower.startswith("embed_target_coords."):
-            return 0  # Coordinate embeddings at the start
-        elif "total_grad_norm" in param_name_lower:
-            return -2  # Special marker for total norm
-        elif any(x in param_name_lower for x in ["weathergen", "stage", "q_cells"]):
-            return -3  # Special marker for metadata
-
-        return -1  # Unknown depth
-
-    def plot_total_gradient_norms(self, figsize=(12, 6)):
-        """Plot total gradient norm over training steps."""
-        # Calculate total norm per step
-        total_norms = []
-        steps = []
-
-        for ith, entry in enumerate(self.data):
-            # step = entry.get('num_samples', entry.get('epoch', 0))
-            step = ith * 5
-
-            if "gradients" in entry:
-                grad_data = entry["gradients"]
-            elif "grad_norms" in entry:
-                grad_data = entry["grad_norms"]
-            else:
-                grad_data = {k: v for k, v in entry.items() if "grad_norm" in k}
-
-            if len(grad_data) == 0:
-                continue
-
-            # Calculate total norm (L2 norm of all gradients)
-            total_norm = np.sqrt(sum(float(v) ** 2 for v in grad_data.values()))
-            total_norms.append(total_norm)
-            steps.append(step)
-
-        plt.figure(figsize=figsize)
-        plt.plot(steps, total_norms, linewidth=1.5, alpha=0.8)
-        plt.xlabel("Training Step")
-        plt.ylabel("Total Gradient Norm")
-        plt.title("Total Gradient Norm vs Training Steps")
-        plt.yscale("log")
-        plt.grid(True, alpha=0.3)
-        plt.tight_layout()
-        plt.savefig("plots/total_grad_norm.png")
-
-        return steps, total_norms
-
-    def plot_layer_type_norms(self, figsize=(14, 8)):
-        """Plot gradient norms grouped by layer type."""
-        if self.df is None:
-            print("No DataFrame available. Load data first.")
-            return
-
-        plt.figure(figsize=figsize)
-
-        # Get unique layer types
-        layer_types = self.df["layer_type"].unique()
-        print(layer_types)
-        colors = plt.cm.tab10(np.linspace(0, 1, len(layer_types)))
-
-        for i, layer_type in enumerate(layer_types):
-            layer_data = self.df[self.df["layer_type"] == layer_type]
-
-            # Calculate mean gradient norm per step for this layer type
-            mean_norms = layer_data.groupby("num_samples")["grad_norm"].mean()
-
-            plt.plot(
-                mean_norms.index, mean_norms.values, label=layer_type, color=colors[i], alpha=0.8
-            )
-
-        plt.xlabel("Training Step")
-        plt.ylabel("Mean Gradient Norm")
-        plt.title("Gradient Norms by Layer Type")
-        plt.yscale("log")
-        plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
-        plt.grid(True, alpha=0.3)
-        plt.tight_layout()
-        plt.savefig("plots/grad_norm_by_layer_type.png")
-
-    def plot_layer_depth_analysis(self, figsize=(12, 8)):
-        """Plot gradient norms by layer depth."""
-        if self.df is None:
-            print("No DataFrame available. Load data first.")
-            return
-
-        # Filter out unknown depths
-        depth_data = self.df[self.df["layer_depth"] >= 0]
-
-        if len(depth_data) == 0:
-            print("No layer depth information found in parameter names.")
-            return
-
-        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize)
-
-        # Plot 1: Mean gradient norm by depth over time
-        depths = sorted(depth_data["layer_depth"].unique())
-        colors = plt.cm.viridis(np.linspace(0, 1, len(depths)))
-
-        for i, depth in enumerate(depths):
-            layer_data = depth_data[depth_data["layer_depth"] == depth]
-            mean_norms = layer_data.groupby("num_samples")["grad_norm"].mean()
-
-            ax1.plot(
-                mean_norms.index,
-                mean_norms.values,
-                label=f"Layer {depth}",
-                color=colors[i],
-                alpha=0.8,
-            )
-
-        ax1.set_xlabel("Training Step")
-        ax1.set_ylabel("Mean Gradient Norm")
-        ax1.set_title("Gradient Norms by Layer Depth")
-        ax1.set_yscale("log")
-        ax1.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
-        ax1.grid(True, alpha=0.3)
-
-        # Plot 2: Heatmap of gradient norms by depth and step
-        pivot_data = (
-            depth_data.groupby(["num_samples", "layer_depth"])["grad_norm"].mean().unstack()
-        )
-
-        # Sample data if too many steps for readability
-        if len(pivot_data) > 100:
-            sample_idx = np.linspace(0, len(pivot_data) - 1, 100, dtype=int)
-            pivot_data = pivot_data.iloc[sample_idx]
-
-        im = ax2.imshow(
-            pivot_data.T,
-            aspect="auto",
-            cmap="viridis",
-            extent=[
-                pivot_data.index.min(),
-                pivot_data.index.max(),
-                pivot_data.columns.min(),
-                pivot_data.columns.max(),
-            ],
-        )
-        ax2.set_xlabel("Training Step")
-        ax2.set_ylabel("Layer Depth")
-        ax2.set_title("Gradient Norm Heatmap (Layer Depth vs Step)")
-
-        cbar = plt.colorbar(im, ax=ax2)
-        cbar.set_label("Gradient Norm")
-
-        plt.tight_layout()
-        plt.savefig("plots/grad_norm_heatmap.png")
-
-    def plot_gradient_distribution(self, figsize=(15, 10)):
-        """Plot distribution of gradient norms."""
-        if self.df is None:
-            print("No DataFrame available. Load data first.")
-            return
-
-        fig, axes = plt.subplots(2, 2, figsize=figsize)
-
-        # Plot 1: Histogram of all gradient norms
-        axes[0, 0].hist(np.log10(self.df["grad_norm"].values), bins=50, alpha=0.7)
-        axes[0, 0].set_xlabel("Log10(Gradient Norm)")
-        axes[0, 0].set_ylabel("Frequency")
-        axes[0, 0].set_title("Distribution of Gradient Norms (Log Scale)")
-        axes[0, 0].grid(True, alpha=0.3)
-
-        # Plot 2: Box plot by layer type
-        layer_types = self.df["layer_type"].unique()[:10]  # Limit to 10 for readability
-        plot_data = [
-            np.log10(self.df[self.df["layer_type"] == lt]["grad_norm"].values) for lt in layer_types
-        ]
-
-        axes[0, 1].boxplot(plot_data, labels=layer_types)
-        axes[0, 1].set_xlabel("Layer Type")
-        axes[0, 1].set_ylabel("Log10(Gradient Norm)")
-        axes[0, 1].set_title("Gradient Norm Distribution by Layer Type")
-        axes[0, 1].tick_params(axis="x", rotation=45)
-        axes[0, 1].grid(True, alpha=0.3)
-
-        # Plot 3: Gradient norms over time (sample of parameters)
-        sample_params = self.df["parameter"].unique()[:20]  # Sample 20 parameters
-        for param in sample_params:
-            param_data = self.df[self.df["parameter"] == param]
-            axes[1, 0].plot(
-                param_data["num_samples"], param_data["grad_norm"], alpha=0.6, linewidth=0.8
-            )
-
-        axes[1, 0].set_xlabel("Training Step")
-        axes[1, 0].set_ylabel("Gradient Norm")
-        axes[1, 0].set_title("Individual Parameter Gradient Norms (Sample)")
-        axes[1, 0].set_yscale("log")
-        axes[1, 0].grid(True, alpha=0.3)
-
-        # Plot 4: Statistics over time
-        stats_by_step = self.df.groupby("num_samples")["grad_norm"].agg(
-            ["mean", "std", "min", "max"]
-        )
-
-        axes[1, 1].fill_between(
-            stats_by_step.index,
-            stats_by_step["mean"] - stats_by_step["std"],
-            stats_by_step["mean"] + stats_by_step["std"],
-            alpha=0.3,
-            label="±1 std",
-        )
-        axes[1, 1].plot(stats_by_step.index, stats_by_step["mean"], label="Mean", linewidth=2)
-        axes[1, 1].plot(
-            stats_by_step.index, stats_by_step["max"], label="Max", linewidth=1, alpha=0.8
-        )
-        axes[1, 1].plot(
-            stats_by_step.index, stats_by_step["min"], label="Min", linewidth=1, alpha=0.8
-        )
-
-        axes[1, 1].set_xlabel("Training Step")
-        axes[1, 1].set_ylabel("Gradient Norm")
-        axes[1, 1].set_title("Gradient Norm Statistics Over Time")
-        axes[1, 1].set_yscale("log")
-        axes[1, 1].legend()
-        axes[1, 1].grid(True, alpha=0.3)
-
-        plt.tight_layout()
-        plt.savefig("plots/grad_norm_over_time.png")
-
-    def generate_summary_report(self):
-        """Generate a summary report of gradient norm statistics."""
-        if self.df is None:
-            print("No DataFrame available. Load data first.")
-            return
-
-        print("=== GRADIENT NORMS ANALYSIS REPORT ===")
-        print(f"Total data points: {len(self.df)}")
-        print(f"Training steps: {self.df['num_samples'].nunique()}")
-        print(f"Unique parameters: {self.df['parameter'].nunique()}")
-        print()
-
-        print("Overall Statistics:")
-        print(f"Mean gradient norm: {self.df['grad_norm'].mean():.6f}")
-        print(f"Median gradient norm: {self.df['grad_norm'].median():.6f}")
-        print(f"Min gradient norm: {self.df['grad_norm'].min():.6f}")
-        print(f"Max gradient norm: {self.df['grad_norm'].max():.6f}")
-        print()
-
-        print("Statistics by Layer Type:")
-        layer_stats = self.df.groupby("layer_type")["grad_norm"].agg(
-            ["count", "mean", "std", "min", "max"]
-        )
-        print(layer_stats)
-        print()
-
-        # Check for potential issues
-        print("Potential Issues:")
-        very_small = (self.df["grad_norm"] < 1e-6).sum()
-        very_large = (self.df["grad_norm"] > 10.0).sum()
-
-        if very_small > 0:
-            print(f"⚠️  {very_small} gradient norms < 1e-6 (possible vanishing gradients)")
-        if very_large > 0:
-            print(f"⚠️  {very_large} gradient norms > 10.0 (possible exploding gradients)")
-
-        if very_small == 0 and very_large == 0:
-            print("✅ No obvious gradient issues detected")
-
-
-# Usage example
-def analyze_gradient_file(json_file_path):
-    """
-    Main function to analyze gradient norms from a JSON file.
-
-    Usage:
-    analyze_gradient_file('gradient_norms.jsonl')
-    """
-
-    analyzer = GradientNormsAnalyzer(json_file_path)
-
-    # Generate summary report
-    analyzer.generate_summary_report()
-
-    # Create all plots
-    print("\n=== GENERATING PLOTS ===")
-
-    print("1. Total gradient norms over time...")
-    analyzer.plot_total_gradient_norms()
-
-    print("2. Gradient norms by layer type...")
-    analyzer.plot_layer_type_norms()
-
-    print("3. Layer depth analysis...")
-    analyzer.plot_layer_depth_analysis()
-
-    print("4. Gradient distribution analysis...")
-    analyzer.plot_gradient_distribution()
-
-    return analyzer
-
-
-# Example usage:
-# uv run python src/weathergen/utils/plot_grad_norms.py results/yvhxm2jc/yvhxm2jc_train_metrics.json
-if __name__ == "__main__":
-    import sys
-
-    analyzer = analyze_gradient_file(sys.argv[1])

From 8bd73835c044e694d7ab068507513c75a3f12a11 Mon Sep 17 00:00:00 2001
From: Jubeku <julian.kuehnert@ecmwf.int>
Date: Thu, 16 Oct 2025 15:15:11 +0200
Subject: [PATCH 14/19] rm seaborn from pyproject

---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6a06230aa..80654ec01 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,6 @@ dependencies = [
  "numexpr>=2.11.0",
  "weathergen-common",
  "weathergen-evaluate",
- "seaborn>=0.13.2",
 ]
 
 

From 9892dfaa76c4519f9814fbd94396f40115b39cdf Mon Sep 17 00:00:00 2001
From: Jubeku <julian.kuehnert@ecmwf.int>
Date: Tue, 21 Oct 2025 14:26:26 +0200
Subject: [PATCH 15/19] updating terminal and metrics logging, add
 get_tensor_item fct

---
 src/weathergen/model/model.py   |  6 ++--
 src/weathergen/train/trainer.py | 55 ++++++++++++++-------------------
 2 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/src/weathergen/model/model.py b/src/weathergen/model/model.py
index 803c0312b..18ec6537b 100644
--- a/src/weathergen/model/model.py
+++ b/src/weathergen/model/model.py
@@ -596,7 +596,7 @@ def forward(self, model_params: ModelParams, batch, forecast_offset: int, foreca
                 if noise_std > 0.0:
                     tokens = tokens + torch.randn_like(tokens) * torch.norm(tokens) * noise_std
 
-            tokens = self.forecast(model_params, tokens)
+            tokens = self.forecast(model_params, tokens, fstep)
 
         # prediction for final step
         preds_all += [
@@ -793,7 +793,7 @@ def assimilate_global(self, model_params: ModelParams, tokens: torch.Tensor) ->
         return tokens
 
     #########################################
-    def forecast(self, model_params: ModelParams, tokens: torch.Tensor) -> torch.Tensor:
+    def forecast(self, model_params: ModelParams, tokens: torch.Tensor, fstep: int) -> torch.Tensor:
         """Advances latent space representation in time
 
         Args:
@@ -806,7 +806,7 @@ def forecast(self, model_params: ModelParams, tokens: torch.Tensor) -> torch.Ten
         """
 
         for it, block in enumerate(self.fe_blocks):
-            aux_info = torch.tensor([it], dtype=torch.float32, device="cuda")
+            aux_info = torch.tensor([fstep], dtype=torch.float32, device="cuda")
             tokens = checkpoint(block, tokens, aux_info, use_reentrant=False)
 
         return tokens
diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
index f3980a96f..2abe9d921 100644
--- a/src/weathergen/train/trainer.py
+++ b/src/weathergen/train/trainer.py
@@ -556,7 +556,6 @@ def train(self, epoch):
 
         # Unweighted loss, real weighted loss, std for losses that need it
         self.loss_unweighted_hist, self.loss_model_hist, self.stdev_unweighted_hist = [], [], []
-        self.last_grad_norm = 0.0
 
         # training loop
         self.t_start = time.time()
@@ -593,8 +592,11 @@ def train(self, epoch):
             )
 
             # log gradient norms
-            if bidx % log_interval == 0 and self.log_grad_norms:
-                self._log_instant_grad_norms(TRAIN, total_norm)
+            if self.log_grad_norms:
+                if bidx % self.train_log_freq.terminal == 0:
+                    self.last_grad_norm = self._get_tensor_item(total_norm)
+                if bidx % self.train_log_freq.metrics == 0:
+                    self._log_instant_grad_norms(TRAIN, total_norm)
 
             # optimizer step
             self.grad_scaler.step(self.optimizer)
@@ -980,31 +982,25 @@ def _log(self, stage: Stage):
 
         self.loss_unweighted_hist, self.loss_model_hist, self.stdev_unweighted_hist = [], [], []
 
-    def _log_instant_grad_norms(self, stage: Stage, total_norm):
+    def _get_tensor_item(self, tensor):
+        """
+        When using FSDP2, we need full_tensor().item() instead of .item(), see here:
+        https://gist.github.com/Kai-46/a9835ef3f36e76d06afee6c11f388144
+        """
+        return tensor.full_tensor().item() if self.cf.world_size > 1 else tensor.item()
+
+    def _log_instant_grad_norms(self, stage: Stage):
         """
         Log instantaneous grad norms, we do not average because of the cost and because we want to
         measure the actual values.
-
-        Note: When using FSDP2, we need full_tensor().item() instead of .item(), see here:
-        https://gist.github.com/Kai-46/a9835ef3f36e76d06afee6c11f388144
         """
-        self.last_grad_norm = (
-            total_norm.full_tensor().item() if self.cf.world_size > 1 else total_norm.item()
-        )
         grad_norms = {"total_grad_norm": self.last_grad_norm}
         for name, param in self.model.named_parameters():
             if param.grad is not None:
-                # grad_norms["grad_norm_" + name] = param.grad.norm().item()
-                grad_norms["grad_norm_" + name] = (
-                    param.grad.norm().full_tensor().item()
-                    if self.cf.world_size > 1
-                    else param.grad.norm().item()
-                )
+                grad_norms["grad_norm_" + name] = self._get_tensor_item(param.grad.norm())
 
-        # print(".item():", param.grad.norm().item())
-        # print(".full_tensor().item()", param.grad.norm().full_tensor().item())
         if is_root():
-            self.train_logger.log_metrics(TRAIN, grad_norms)
+            self.train_logger.log_metrics(stage, grad_norms)
 
     def _log_terminal(self, bidx: int, epoch: int, stage: Stage):
         print_freq = self.train_log_freq.terminal
@@ -1027,21 +1023,16 @@ def _log_terminal(self, bidx: int, epoch: int, stage: Stage):
                 elif stage == TRAIN:
                     # samples per sec
                     dt = time.time() - self.t_start
-                    pstr = "{:03d} : {:05d}/{:05d} : {:06d} : loss = {:.4E} "
-                    pstr += "(lr={:.2E}, gradient norm={:.3f}, s/sec={:.3f})"
                     len_dataset = len(self.data_loader) // self.cf.batch_size_per_gpu
-                    logger.info(
-                        pstr.format(
-                            epoch,
-                            bidx,
-                            len_dataset,
-                            self.cf.istep,
-                            avg_loss.nanmean().item(),
-                            self.lr_scheduler.get_lr(),
-                            self.last_grad_norm,
-                            (print_freq * self.cf.batch_size_per_gpu) / dt,
-                        ),
+                    pstr = (
+                        f"{epoch:03d} : {bidx:05d}/{len_dataset:05d} : "
+                        + f"{self.cf.istep:06d} : loss = {avg_loss.nanmean().item():.4E} "
+                        + f"(lr={self.lr_scheduler.get_lr():.2E}, "
                     )
+                    if self.log_grad_norms:
+                        pstr += f"gradient norm={self.last_grad_norm:.3f}, "
+                    pstr += f"s/sec={(print_freq * self.cf.batch_size_per_gpu) / dt:.3f})"
+                    logger.info(pstr)
                     logger.info("\t")
                     for _, st in enumerate(self.cf.streams):
                         logger.info(

From 2885062432dd367c120335a1dd45392b4842a796 Mon Sep 17 00:00:00 2001
From: Jubeku <julian.kuehnert@ecmwf.int>
Date: Tue, 21 Oct 2025 16:47:26 +0200
Subject: [PATCH 16/19] check for DTensor instead of world size

---
 src/weathergen/train/trainer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
index 2abe9d921..b8c189319 100644
--- a/src/weathergen/train/trainer.py
+++ b/src/weathergen/train/trainer.py
@@ -596,7 +596,7 @@ def train(self, epoch):
                 if bidx % self.train_log_freq.terminal == 0:
                     self.last_grad_norm = self._get_tensor_item(total_norm)
                 if bidx % self.train_log_freq.metrics == 0:
-                    self._log_instant_grad_norms(TRAIN, total_norm)
+                    self._log_instant_grad_norms(TRAIN)
 
             # optimizer step
             self.grad_scaler.step(self.optimizer)
@@ -984,10 +984,10 @@ def _log(self, stage: Stage):
 
     def _get_tensor_item(self, tensor):
         """
-        When using FSDP2, we need full_tensor().item() instead of .item(), see here:
-        https://gist.github.com/Kai-46/a9835ef3f36e76d06afee6c11f388144
+        When using FSDP2, tensor is a DTensor and we need full_tensor().item() instead of .item(),
+        see here: https://gist.github.com/Kai-46/a9835ef3f36e76d06afee6c11f388144
         """
-        return tensor.full_tensor().item() if self.cf.world_size > 1 else tensor.item()
+        return tensor.full_tensor().item() if isinstance(tensor, DTensor) else tensor.item()
 
     def _log_instant_grad_norms(self, stage: Stage):
         """

From cbb1c85e88fb8c2ec2d15b768bfedf12e95396e8 Mon Sep 17 00:00:00 2001
From: Jubeku <julian.kuehnert@ecmwf.int>
Date: Tue, 21 Oct 2025 17:33:40 +0200
Subject: [PATCH 17/19] revert forecast fct, fix in separate PR

---
 src/weathergen/model/model.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/weathergen/model/model.py b/src/weathergen/model/model.py
index 18ec6537b..803c0312b 100644
--- a/src/weathergen/model/model.py
+++ b/src/weathergen/model/model.py
@@ -596,7 +596,7 @@ def forward(self, model_params: ModelParams, batch, forecast_offset: int, foreca
                 if noise_std > 0.0:
                     tokens = tokens + torch.randn_like(tokens) * torch.norm(tokens) * noise_std
 
-            tokens = self.forecast(model_params, tokens, fstep)
+            tokens = self.forecast(model_params, tokens)
 
         # prediction for final step
         preds_all += [
@@ -793,7 +793,7 @@ def assimilate_global(self, model_params: ModelParams, tokens: torch.Tensor) ->
         return tokens
 
     #########################################
-    def forecast(self, model_params: ModelParams, tokens: torch.Tensor, fstep: int) -> torch.Tensor:
+    def forecast(self, model_params: ModelParams, tokens: torch.Tensor) -> torch.Tensor:
         """Advances latent space representation in time
 
         Args:
@@ -806,7 +806,7 @@ def forecast(self, model_params: ModelParams, tokens: torch.Tensor, fstep: int)
         """
 
         for it, block in enumerate(self.fe_blocks):
-            aux_info = torch.tensor([fstep], dtype=torch.float32, device="cuda")
+            aux_info = torch.tensor([it], dtype=torch.float32, device="cuda")
             tokens = checkpoint(block, tokens, aux_info, use_reentrant=False)
 
         return tokens

From 75749df7dd146a7ca1bb23e107d21940cf927e32 Mon Sep 17 00:00:00 2001
From: Jubeku <julian.kuehnert@ecmwf.int>
Date: Thu, 23 Oct 2025 10:41:00 +0200
Subject: [PATCH 18/19] rename grad_norm log names to exclude from MLFlow

---
 src/weathergen/train/trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py
index b8c189319..75732cdb4 100644
--- a/src/weathergen/train/trainer.py
+++ b/src/weathergen/train/trainer.py
@@ -994,10 +994,10 @@ def _log_instant_grad_norms(self, stage: Stage):
         Log instantaneous grad norms, we do not average because of the cost and because we want to
         measure the actual values.
         """
-        grad_norms = {"total_grad_norm": self.last_grad_norm}
+        grad_norms = {"grad_norm.total": self.last_grad_norm}
         for name, param in self.model.named_parameters():
             if param.grad is not None:
-                grad_norms["grad_norm_" + name] = self._get_tensor_item(param.grad.norm())
+                grad_norms["grad_norm." + name] = self._get_tensor_item(param.grad.norm())
 
         if is_root():
             self.train_logger.log_metrics(stage, grad_norms)

From f1c24fa1891979e102c46bfd36c38054835a010d Mon Sep 17 00:00:00 2001
From: Jubeku <julian.kuehnert@ecmwf.int>
Date: Fri, 24 Oct 2025 15:32:25 +0200
Subject: [PATCH 19/19] add log_grad_norms to default config

---
 config/default_config.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/config/default_config.yml b/config/default_config.yml
index 679f58dd3..620f5c4ae 100644
--- a/config/default_config.yml
+++ b/config/default_config.yml
@@ -133,6 +133,7 @@ grad_clip: 1.0
 weight_decay: 0.1
 norm_type: "LayerNorm"
 nn_module: "te"
+log_grad_norms: False
 
 start_date: 197901010000
 end_date: 202012310000