From a0039ecfebd8634588a3f5afa44e66f712fa365f Mon Sep 17 00:00:00 2001 From: sophiex <24638638+sophie-xhonneux@users.noreply.github.com> Date: Wed, 6 Aug 2025 12:24:36 +0000 Subject: [PATCH 01/19] Log gradient norms --- src/weathergen/train/trainer.py | 18 ++++++++++++++++-- src/weathergen/utils/train_logger.py | 2 ++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py index 56a28d089..46653b9e9 100644 --- a/src/weathergen/train/trainer.py +++ b/src/weathergen/train/trainer.py @@ -76,6 +76,7 @@ def init( self.init_perf_monitoring() self.train_logger = TrainLogger(cf, config.get_path_run(self.cf)) + self.last_grad_norm = 0.0 def inference(self, cf, run_id_trained, epoch): # general initalization @@ -482,7 +483,19 @@ def train(self, epoch): # gradient clipping self.grad_scaler.unscale_(self.optimizer) - torch.nn.utils.clip_grad_norm_(self.ddp_model.parameters(), max_norm=cf.grad_clip) + total_norm = torch.nn.utils.clip_grad_norm_( + self.ddp_model.parameters(), max_norm=cf.grad_clip + ) + + # log gradient norms + if bidx % log_interval == 0: + grad_norms = { "total_grad_norm" : total_norm.item() } + self.last_grad_norm = total_norm.item() + for name, param in self.ddp_model.named_parameters(): + if param.grad is not None: + grad_norms[name] = param.grad.norm().item() + self.train_logger.log_metrics(TRAIN, grad_norms) + # optimizer step self.grad_scaler.step(self.optimizer) @@ -718,7 +731,7 @@ def _log_terminal(self, bidx: int, epoch: int, stage: Stage): # samples per sec dt = time.time() - self.t_start pstr = "{:03d} : {:05d}/{:05d} : {:06d} : loss = {:.4E} " - pstr += "(lr={:.2E}, s/sec={:.3f})" + pstr += "(lr={:.2E}, gradient norm={:.3f}, s/sec={:.3f})" len_dataset = len(self.data_loader) // self.cf.batch_size_per_gpu print( pstr.format( @@ -728,6 +741,7 @@ def _log_terminal(self, bidx: int, epoch: int, stage: Stage): self.cf.istep, avg_loss.nanmean().item(), self.lr_scheduler.get_lr(), + 
self.last_grad_norm, (self.print_freq * self.cf.batch_size_per_gpu) / dt, ), flush=True, diff --git a/src/weathergen/utils/train_logger.py b/src/weathergen/utils/train_logger.py index be70a243b..c4db39172 100644 --- a/src/weathergen/utils/train_logger.py +++ b/src/weathergen/utils/train_logger.py @@ -146,6 +146,8 @@ def add_train( metrics[_performance_gpu] = perf_gpu if perf_mem > 0.0: metrics[_performance_memory] = perf_mem + + self.log_metrics("train", metrics) with open(self.path_run / (self.cf.run_id + "_perf_log.txt"), "ab") as f: np.savetxt(f, log_vals) From e83903b5f6799854933550dbe3ef4b0ac36b227c Mon Sep 17 00:00:00 2001 From: sophiex <24638638+sophie-xhonneux@users.noreply.github.com> Date: Wed, 6 Aug 2025 15:24:05 +0000 Subject: [PATCH 02/19] Prototype for recording grad norms --- pyproject.toml | 1 + src/weathergen/train/trainer.py | 2 +- src/weathergen/utils/plot_grad_norms.py | 483 ++++++++++++++++++++++++ uv.lock | 16 + 4 files changed, 501 insertions(+), 1 deletion(-) create mode 100644 src/weathergen/utils/plot_grad_norms.py diff --git a/pyproject.toml b/pyproject.toml index aa6232bb8..7511b0327 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "dask~=2025.5.1", "hatchling", "weathergen-common", + "seaborn>=0.13.2", ] [project.urls] diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py index 46653b9e9..4430211ac 100644 --- a/src/weathergen/train/trainer.py +++ b/src/weathergen/train/trainer.py @@ -493,7 +493,7 @@ def train(self, epoch): self.last_grad_norm = total_norm.item() for name, param in self.ddp_model.named_parameters(): if param.grad is not None: - grad_norms[name] = param.grad.norm().item() + grad_norms["grad_norm_" + name] = param.grad.norm().item() self.train_logger.log_metrics(TRAIN, grad_norms) diff --git a/src/weathergen/utils/plot_grad_norms.py b/src/weathergen/utils/plot_grad_norms.py new file mode 100644 index 000000000..8a6ded4ac --- /dev/null +++ 
"""Analyse and plot per-parameter gradient norms logged during training.

The input is a JSON-lines metrics file: one JSON object per line.  Gradient
norms are read from a nested ``gradients`` / ``grad_norms`` object if present,
otherwise from top-level ``grad_norm_<parameter name>`` keys.

Usage:
    uv run python src/weathergen/utils/plot_grad_norms.py <run_id>_train_metrics.json
"""

import json
import re
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Prefix of the per-parameter keys and name of the aggregate key in the log.
_GRAD_KEY_PREFIX = "grad_norm_"
_TOTAL_KEY = "total_grad_norm"

# NOTE(review): assumed to be the prefix a DistributedDataParallel wrapper adds to
# parameter names; it is stripped before classification -- confirm against real logs.
_WRAPPER_PREFIX = "module."

_RECORD_COLUMNS = ["num_samples", "parameter", "grad_norm", "layer_type", "layer_depth"]

# (substring, role) pairs shared by every attention-carrying component.
_ATTENTION_ROLES = (
    ("proj_heads_q", "attention_q"),
    ("proj_heads_k", "attention_k"),
    ("proj_heads_v", "attention_v"),
    ("proj_out", "attention_out"),
)

# (name prefix, label prefix) for components that only distinguish attention / ffn / other.
_BLOCK_COMPONENTS = (
    ("ae_local_blocks.", "ae_local"),
    ("ae_global_blocks.", "ae_global"),
    ("ae_adapter.", "ae_adapter"),
)

# Fallback markers for parameter names that follow none of the known prefixes.
_GENERIC_ATTENTION_ROLES = (
    (("q_proj", "query"), "attention_q"),
    (("k_proj", "key"), "attention_k"),
    (("v_proj", "value"), "attention_v"),
    (("o_proj", "out"), "attention_out"),
)

# Tried in order; the first match wins, so more specific patterns must come first.
_DEPTH_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # embeds.0.layers.N.* (layers within embeds)
        r"embeds\.\d+\.layers\.(\d+)\.",
        # embeds.0.unembed.N.*
        r"embeds\.\d+\.unembed\.(\d+)\.",
        # embeds.0.ln_final.N.*
        r"embeds\.\d+\.ln_final\.(\d+)\.",
        # ae_local_blocks.N.*
        r"ae_local_blocks\.(\d+)\.",
        # ae_global_blocks.N.*
        r"ae_global_blocks\.(\d+)\.",
        # ae_adapter.N.*
        r"ae_adapter\.(\d+)\.",
        # target_token_engines.0.tte.N.block.M.* -- must precede the shorter
        # tte.N pattern below, which would otherwise always match first.
        r"target_token_engines\.\d+\.tte\.(\d+)\.block\.(\d+)\.",
        # target_token_engines.0.tte.N.*
        r"target_token_engines\.\d+\.tte\.(\d+)\.",
        # pred_heads.0.pred_heads.0.N.*
        r"pred_heads\.\d+\.pred_heads\.\d+\.(\d+)\.",
        # Generic patterns for any numbered layers
        r"layer[s]?\.(\d+)",
        r"h\.(\d+)",
        r"transformer\.(\d+)",
        r"blocks\.(\d+)",
    )
)


def _pyplot():
    """Return ``matplotlib.pyplot``, importing it on first use.

    The import is deferred so that loading a log and printing the summary
    report do not require a plotting stack.
    """
    import matplotlib.pyplot as plt

    return plt


def _normalize_param_name(param_name):
    """Lower-case a logged key and strip the logging / wrapper prefixes."""
    name = param_name.lower()
    for prefix in (_GRAD_KEY_PREFIX, _WRAPPER_PREFIX):
        if name.startswith(prefix):
            name = name[len(prefix):]
    return name


def _attention_role(name):
    """Return the attention role encoded in ``name``, or ``None`` if there is none."""
    for marker, role in _ATTENTION_ROLES:
        if marker in name:
            return role
    return None


class GradientNormsAnalyzer:
    """Load gradient norms from a JSON-lines log and plot / summarise them."""

    def __init__(self, json_file_path, output_dir="plots", log_interval=5):
        """Load ``json_file_path`` and build the analysis DataFrame.

        Args:
            json_file_path: Path to a file with one JSON object per line.
            output_dir: Directory the figures are written to; created on demand.
            log_interval: Step distance assigned to consecutive log entries.
        """
        self.json_file_path = Path(json_file_path)
        self.output_dir = Path(output_dir)
        self.log_interval = log_interval
        self.data = []
        self.df = None
        self.load_data()

    def load_data(self):
        """Read the log file into ``self.data`` and rebuild ``self.df``.

        Blank lines are ignored; lines that are not valid JSON objects are
        reported and skipped so that a damaged log can still be analysed.
        """
        print(f"Loading data from {self.json_file_path}...")

        # Start from scratch so that calling load_data() again does not duplicate entries.
        self.data = []
        with open(self.json_file_path, encoding="utf-8") as f:
            for line_num, line in enumerate(f, 1):
                text = line.strip()
                if not text:
                    continue
                try:
                    data_point = json.loads(text)
                except json.JSONDecodeError as e:
                    print(f"Warning: Could not parse line {line_num}: {e}")
                    continue
                if not isinstance(data_point, dict):
                    print(f"Warning: Skipping line {line_num}: expected a JSON object")
                    continue
                self.data.append(data_point)

        print(f"Loaded {len(self.data)} data points")
        self.create_dataframe()

    def _iter_gradient_records(self):
        """Yield ``(step, {key: norm})`` for every entry that carries gradient norms.

        Key selection, in order of preference:
          1. a nested ``gradients`` or ``grad_norms`` object,
          2. top-level keys starting with ``grad_norm_`` if any entry of the
             log uses that prefix,
          3. otherwise the legacy heuristic for unprefixed logs (keys containing
             ``q_cells`` or ``0`` but not ``stream``).
        The aggregate ``total_grad_norm`` key is never returned.

        NOTE(review): the step is ``entry index * log_interval`` and the index
        counts every entry in the file, including entries without gradient
        norms -- confirm this matches how the log is written.
        """
        prefixed_log = any(
            key.startswith(_GRAD_KEY_PREFIX) for entry in self.data for key in entry
        )

        for index, entry in enumerate(self.data):
            if "gradients" in entry:
                candidates = entry["gradients"]
            elif "grad_norms" in entry:
                candidates = entry["grad_norms"]
            elif prefixed_log:
                candidates = {
                    k: v for k, v in entry.items() if k.startswith(_GRAD_KEY_PREFIX)
                }
            else:
                candidates = {
                    k: v
                    for k, v in entry.items()
                    if "stream" not in k and ("q_cells" in k or "0" in k)
                }

            grad_data = {k: v for k, v in candidates.items() if k != _TOTAL_KEY}
            if grad_data:
                yield index * self.log_interval, grad_data

    def create_dataframe(self):
        """Convert the loaded entries into one DataFrame row per (step, parameter)."""
        rows = [
            {
                "num_samples": step,
                "parameter": param_name,
                "grad_norm": float(norm_value),
                "layer_type": self.extract_layer_type(param_name),
                "layer_depth": self.extract_layer_depth(param_name),
            }
            for step, grad_data in self._iter_gradient_records()
            for param_name, norm_value in grad_data.items()
        ]

        # Explicit columns keep the frame usable even when no rows were found.
        self.df = pd.DataFrame(rows, columns=_RECORD_COLUMNS)
        print(f"Created DataFrame with {len(self.df)} gradient norm records")

    def extract_layer_type(self, param_name):
        """Map a parameter name to a coarse layer-type label.

        The name is matched case-insensitively and with any ``grad_norm_`` /
        ``module.`` prefix removed.  Unknown names fall back to generic
        substring checks and finally to ``'other'``.
        """
        name = _normalize_param_name(param_name)

        if name.startswith("embeds."):
            for marker, label in (
                (".embed.", "embedding"),
                (".unembed.", "unembedding"),
                (".ln_final.", "layer_norm_final"),
            ):
                if marker in name:
                    return label
            role = _attention_role(name)
            if role is not None:
                return role
            if ".layers." in name and ("weight" in name or "bias" in name):
                return "ffn"
            return "embeds_other"

        for prefix, tag in _BLOCK_COMPONENTS:
            if name.startswith(prefix):
                role = _attention_role(name)
                if role is not None:
                    return f"{tag}_{role}"
                return f"{tag}_ffn" if ".layers." in name else f"{tag}_other"

        if name.startswith("target_token_engines."):
            role = _attention_role(name)
            if role is not None:
                return f"tte_{role}"
            if "embed_aux" in name:
                return "tte_embed_aux"
            if "lnorm" in name:
                return "tte_layer_norm"
            if ".layers." in name:
                return "tte_ffn"
            return "tte_other"

        if name.startswith("embed_target_coords."):
            return "target_coords_embedding"

        if name.startswith("pred_heads."):
            return "prediction_head"

        # Fallback for names that follow none of the known prefixes.
        if "embed" in name:
            return "embedding"
        if "attention" in name or "attn" in name:
            for markers, label in _GENERIC_ATTENTION_ROLES:
                if any(marker in name for marker in markers):
                    return label
            return "attention"
        if "layernorm" in name or "layer_norm" in name or "ln" in name:
            return "layernorm"
        return "other"

    def extract_layer_depth(self, param_name):
        """Extract a layer index from a parameter name.

        Returns:
            The matched index; ``outer * 10 + inner`` for nested
            ``tte.N.block.M`` names (ambiguous once ``inner`` reaches 10);
            ``0`` for ``embed_target_coords.*``; ``-2`` for the total norm;
            ``-3`` for metadata keys; ``-1`` when no depth can be determined.
        """
        name = _normalize_param_name(param_name)

        for pattern in _DEPTH_PATTERNS:
            match = pattern.search(name)
            if match is None:
                continue
            if pattern.groups > 1:
                return int(match.group(1)) * 10 + int(match.group(2))
            return int(match.group(1))

        # Special handling for components without a numbered depth.
        if name.startswith("embed_target_coords."):
            return 0
        if "total_grad_norm" in name:
            return -2
        if any(marker in name for marker in ("weathergen", "stage", "q_cells")):
            return -3
        return -1

    def _has_records(self):
        """Return True if there is data to analyse; otherwise print a hint."""
        if self.df is None or self.df.empty:
            print("No DataFrame available. Load data first.")
            return False
        return True

    def _save_figure(self, fig, filename):
        """Write ``fig`` to ``output_dir / filename`` and release the figure."""
        self.output_dir.mkdir(parents=True, exist_ok=True)
        target = self.output_dir / filename
        fig.savefig(target)
        # Figures are only written to disk; close them so repeated runs do not pile up.
        _pyplot().close(fig)
        return target

    def plot_total_gradient_norms(self, figsize=(12, 6)):
        """Plot the L2 norm over all per-parameter norms against the step.

        Returns:
            ``(steps, total_norms)`` as two lists of equal length.
        """
        steps = []
        total_norms = []
        for step, grad_data in self._iter_gradient_records():
            # L2 norm of the per-parameter norms logged for this entry.
            total_norms.append(np.sqrt(sum(float(v) ** 2 for v in grad_data.values())))
            steps.append(step)

        plt = _pyplot()
        fig, ax = plt.subplots(figsize=figsize)
        ax.plot(steps, total_norms, linewidth=1.5, alpha=0.8)
        ax.set_xlabel("Training Step")
        ax.set_ylabel("Total Gradient Norm")
        ax.set_title("Total Gradient Norm vs Training Steps")
        ax.set_yscale("log")
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        self._save_figure(fig, "total_grad_norm.png")

        return steps, total_norms

    def plot_layer_type_norms(self, figsize=(14, 8)):
        """Plot the mean gradient norm per step, one line per layer type."""
        if not self._has_records():
            return

        plt = _pyplot()
        fig, ax = plt.subplots(figsize=figsize)

        layer_types = self.df["layer_type"].unique()
        colors = plt.cm.tab10(np.linspace(0, 1, len(layer_types)))

        for color, layer_type in zip(colors, layer_types):
            of_type = self.df[self.df["layer_type"] == layer_type]
            mean_norms = of_type.groupby("num_samples")["grad_norm"].mean()
            ax.plot(
                mean_norms.index, mean_norms.values, label=layer_type, color=color, alpha=0.8
            )

        ax.set_xlabel("Training Step")
        ax.set_ylabel("Mean Gradient Norm")
        ax.set_title("Gradient Norms by Layer Type")
        ax.set_yscale("log")
        ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
        ax.grid(True, alpha=0.3)
        fig.tight_layout()
        self._save_figure(fig, "grad_norm_by_layer_type.png")

    def plot_layer_depth_analysis(self, figsize=(12, 8)):
        """Plot mean gradient norms per layer depth as lines and as a heatmap."""
        if not self._has_records():
            return

        # Negative depths are the "unknown" / special markers of extract_layer_depth.
        depth_data = self.df[self.df["layer_depth"] >= 0]
        if len(depth_data) == 0:
            print("No layer depth information found in parameter names.")
            return

        plt = _pyplot()
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize)

        # Plot 1: mean gradient norm by depth over time
        depths = sorted(depth_data["layer_depth"].unique())
        colors = plt.cm.viridis(np.linspace(0, 1, len(depths)))
        for color, depth in zip(colors, depths):
            at_depth = depth_data[depth_data["layer_depth"] == depth]
            mean_norms = at_depth.groupby("num_samples")["grad_norm"].mean()
            ax1.plot(
                mean_norms.index,
                mean_norms.values,
                label=f"Layer {depth}",
                color=color,
                alpha=0.8,
            )

        ax1.set_xlabel("Training Step")
        ax1.set_ylabel("Mean Gradient Norm")
        ax1.set_title("Gradient Norms by Layer Depth")
        ax1.set_yscale("log")
        ax1.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
        ax1.grid(True, alpha=0.3)

        # Plot 2: heatmap of gradient norms by depth and step
        pivot_data = (
            depth_data.groupby(["num_samples", "layer_depth"])["grad_norm"].mean().unstack()
        )

        # Sample steps if there are too many for readability
        if len(pivot_data) > 100:
            sample_idx = np.linspace(0, len(pivot_data) - 1, 100, dtype=int)
            pivot_data = pivot_data.iloc[sample_idx]

        # origin="lower" puts the first row (smallest depth) at the bottom, which is
        # where the extent places the smallest depth value on the y axis.
        im = ax2.imshow(
            pivot_data.T,
            aspect="auto",
            cmap="viridis",
            origin="lower",
            extent=[
                pivot_data.index.min(),
                pivot_data.index.max(),
                pivot_data.columns.min(),
                pivot_data.columns.max(),
            ],
        )
        ax2.set_xlabel("Training Step")
        ax2.set_ylabel("Layer Depth")
        ax2.set_title("Gradient Norm Heatmap (Layer Depth vs Step)")

        cbar = fig.colorbar(im, ax=ax2)
        cbar.set_label("Gradient Norm")

        fig.tight_layout()
        self._save_figure(fig, "grad_norm_heatmap.png")

    def plot_gradient_distribution(self, figsize=(15, 10)):
        """Plot histogram, per-type box plot, sample traces and per-step statistics."""
        if not self._has_records():
            return

        plt = _pyplot()
        fig, axes = plt.subplots(2, 2, figsize=figsize)
        ax_hist, ax_box = axes[0, 0], axes[0, 1]
        ax_params, ax_stats = axes[1, 0], axes[1, 1]

        # log10 is only defined for positive finite norms; zero, NaN or inf values
        # would otherwise produce non-finite bin edges.
        norms = self.df["grad_norm"]
        positive = self.df[np.isfinite(norms) & (norms > 0)]

        # Plot 1: histogram of all gradient norms
        ax_hist.hist(np.log10(positive["grad_norm"].to_numpy()), bins=50, alpha=0.7)
        ax_hist.set_xlabel("Log10(Gradient Norm)")
        ax_hist.set_ylabel("Frequency")
        ax_hist.set_title("Distribution of Gradient Norms (Log Scale)")
        ax_hist.grid(True, alpha=0.3)

        # Plot 2: box plot by layer type (limited to 10 types for readability)
        box_labels = []
        box_data = []
        for layer_type in self.df["layer_type"].unique()[:10]:
            values = positive.loc[positive["layer_type"] == layer_type, "grad_norm"].to_numpy()
            if values.size:
                box_labels.append(layer_type)
                box_data.append(np.log10(values))
        if box_data:
            ax_box.boxplot(box_data)
            # Set labels on the axis instead of via boxplot(labels=...), whose keyword
            # name differs between matplotlib versions.
            ax_box.set_xticklabels(box_labels)
        ax_box.set_xlabel("Layer Type")
        ax_box.set_ylabel("Log10(Gradient Norm)")
        ax_box.set_title("Gradient Norm Distribution by Layer Type")
        ax_box.tick_params(axis="x", rotation=45)
        ax_box.grid(True, alpha=0.3)

        # Plot 3: gradient norms over time for the first 20 parameters
        for param in self.df["parameter"].unique()[:20]:
            param_data = self.df[self.df["parameter"] == param]
            ax_params.plot(
                param_data["num_samples"], param_data["grad_norm"], alpha=0.6, linewidth=0.8
            )
        ax_params.set_xlabel("Training Step")
        ax_params.set_ylabel("Gradient Norm")
        ax_params.set_title("Individual Parameter Gradient Norms (Sample)")
        ax_params.set_yscale("log")
        ax_params.grid(True, alpha=0.3)

        # Plot 4: statistics over time
        stats_by_step = self.df.groupby("num_samples")["grad_norm"].agg(
            ["mean", "std", "min", "max"]
        )
        ax_stats.fill_between(
            stats_by_step.index,
            stats_by_step["mean"] - stats_by_step["std"],
            stats_by_step["mean"] + stats_by_step["std"],
            alpha=0.3,
            label="±1 std",
        )
        ax_stats.plot(stats_by_step.index, stats_by_step["mean"], label="Mean", linewidth=2)
        ax_stats.plot(
            stats_by_step.index, stats_by_step["max"], label="Max", linewidth=1, alpha=0.8
        )
        ax_stats.plot(
            stats_by_step.index, stats_by_step["min"], label="Min", linewidth=1, alpha=0.8
        )
        ax_stats.set_xlabel("Training Step")
        ax_stats.set_ylabel("Gradient Norm")
        ax_stats.set_title("Gradient Norm Statistics Over Time")
        ax_stats.set_yscale("log")
        ax_stats.legend()
        ax_stats.grid(True, alpha=0.3)

        fig.tight_layout()
        self._save_figure(fig, "grad_norm_over_time.png")

    def generate_summary_report(self):
        """Print overall and per-layer-type statistics plus simple health checks."""
        if not self._has_records():
            return

        norms = self.df["grad_norm"]

        print("=== GRADIENT NORMS ANALYSIS REPORT ===")
        print(f"Total data points: {len(self.df)}")
        print(f"Training steps: {self.df['num_samples'].nunique()}")
        print(f"Unique parameters: {self.df['parameter'].nunique()}")
        print()

        print("Overall Statistics:")
        print(f"Mean gradient norm: {norms.mean():.6f}")
        print(f"Median gradient norm: {norms.median():.6f}")
        print(f"Min gradient norm: {norms.min():.6f}")
        print(f"Max gradient norm: {norms.max():.6f}")
        print()

        print("Statistics by Layer Type:")
        layer_stats = self.df.groupby("layer_type")["grad_norm"].agg(
            ["count", "mean", "std", "min", "max"]
        )
        print(layer_stats)
        print()

        # Flag values outside a plausible range; the thresholds are heuristics.
        print("Potential Issues:")
        very_small = (norms < 1e-6).sum()
        very_large = (norms > 10.0).sum()

        if very_small > 0:
            print(f"⚠️ {very_small} gradient norms < 1e-6 (possible vanishing gradients)")
        if very_large > 0:
            print(f"⚠️ {very_large} gradient norms > 10.0 (possible exploding gradients)")

        if very_small == 0 and very_large == 0:
            print("✅ No obvious gradient issues detected")


def analyze_gradient_file(json_file_path, output_dir="plots", log_interval=5):
    """Print the summary report and write all plots for one metrics file.

    Args:
        json_file_path: Path to the JSON-lines metrics file.
        output_dir: Directory the figures are written to.
        log_interval: Step distance assigned to consecutive log entries.

    Returns:
        The populated ``GradientNormsAnalyzer``.

    Usage:
        analyze_gradient_file('gradient_norms.jsonl')
    """
    analyzer = GradientNormsAnalyzer(
        json_file_path, output_dir=output_dir, log_interval=log_interval
    )

    # Generate summary report
    analyzer.generate_summary_report()

    # Create all plots
    print("\n=== GENERATING PLOTS ===")

    print("1. Total gradient norms over time...")
    analyzer.plot_total_gradient_norms()

    print("2. Gradient norms by layer type...")
    analyzer.plot_layer_type_norms()

    print("3. Layer depth analysis...")
    analyzer.plot_layer_depth_analysis()

    print("4. Gradient distribution analysis...")
    analyzer.plot_gradient_distribution()

    return analyzer


if __name__ == "__main__":
    # Run only when executed as a script, so importing the module has no side effects.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {Path(sys.argv[0]).name} <train_metrics.json>")
    analyze_gradient_file(sys.argv[1])
-1897,6 +1911,7 @@ dependencies = [ { name = "polars" }, { name = "psutil" }, { name = "pynvml" }, + { name = "seaborn" }, { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' and sys_platform != 'macosx' and sys_platform != 'win32'" }, { name = "torch", version = "2.6.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "sys_platform == 'macosx'" }, { name = "torch", version = "2.6.0+cu124", source = { registry = "https://download.pytorch.org/whl/cu124" }, marker = "sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -1928,6 +1943,7 @@ requires-dist = [ { name = "polars", specifier = "~=1.25.2" }, { name = "psutil" }, { name = "pynvml" }, + { name = "seaborn", specifier = ">=0.13.2" }, { name = "torch", marker = "sys_platform != 'linux' and sys_platform != 'macosx' and sys_platform != 'win32'", specifier = "==2.6.0" }, { name = "torch", marker = "sys_platform == 'linux' or sys_platform == 'win32'", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cu124" }, { name = "torch", marker = "sys_platform == 'macosx'", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cpu" }, From d2995b4b6d2b3a7b2ac71c4312eb670f082f1298 Mon Sep 17 00:00:00 2001 From: sophiex <24638638+sophie-xhonneux@users.noreply.github.com> Date: Thu, 7 Aug 2025 10:17:48 +0000 Subject: [PATCH 03/19] Address review changes + hide behind feature flag --- config/default_config.yml | 1 + src/weathergen/train/trainer.py | 29 +++++++++++++++++++------ src/weathergen/utils/plot_grad_norms.py | 5 ++++- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/config/default_config.yml b/config/default_config.yml index e8f21204a..403b1c20d 100644 --- a/config/default_config.yml +++ b/config/default_config.yml @@ -105,6 +105,7 @@ grad_clip: 1.0 weight_decay: 0.1 norm_type: "LayerNorm" nn_module: "te" +log_grad_norms: True start_date: 197901010000 end_date: 202012310000 diff 
--git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py index 4430211ac..b8bf07ea4 100644 --- a/src/weathergen/train/trainer.py +++ b/src/weathergen/train/trainer.py @@ -13,6 +13,8 @@ import time from typing import Any +from omegaconf import OmegaConf + import numpy as np import torch import tqdm @@ -54,6 +56,10 @@ def init( ): self.cf = cf + self.cf = OmegaConf.merge( + OmegaConf.create({"log_grad_norms": False}), self.cf + ) + assert cf.samples_per_epoch % cf.batch_size_per_gpu == 0 assert cf.samples_per_validation % cf.batch_size_validation_per_gpu == 0 assert cf.forecast_policy if cf.forecast_steps > 0 else True @@ -76,7 +82,6 @@ def init( self.init_perf_monitoring() self.train_logger = TrainLogger(cf, config.get_path_run(self.cf)) - self.last_grad_norm = 0.0 def inference(self, cf, run_id_trained, epoch): # general initalization @@ -459,6 +464,7 @@ def train(self, epoch): # Unweighted loss, real weighted loss, std for losses that need it self.loss_unweighted_hist, self.loss_model_hist, self.stdev_unweighted_hist = [], [], [] + self.last_grad_norm = 0.0 # training loop self.t_start = time.time() @@ -489,12 +495,7 @@ def train(self, epoch): # log gradient norms if bidx % log_interval == 0: - grad_norms = { "total_grad_norm" : total_norm.item() } - self.last_grad_norm = total_norm.item() - for name, param in self.ddp_model.named_parameters(): - if param.grad is not None: - grad_norms["grad_norm_" + name] = param.grad.norm().item() - self.train_logger.log_metrics(TRAIN, grad_norms) + self._log_instant_grad_norms(TRAIN, total_norm) # optimizer step @@ -709,6 +710,20 @@ def _log(self, stage: Stage): self.loss_unweighted_hist, self.loss_model_hist, self.stdev_unweighted_hist = [], [], [] + def _log_instant_grad_norms(self, stage: Stage, total_norm): + """ + Log instantaneous grad norms, we do not average because of the cost and because we want to + measure the actual values + + TODO test DDP case + """ + grad_norms = { "total_grad_norm" : 
total_norm.item() } + self.last_grad_norm = total_norm.item() + for name, param in self.ddp_model.named_parameters(): + if param.grad is not None: + grad_norms["grad_norm_" + name] = param.grad.norm().item() + self.train_logger.log_metrics(TRAIN, grad_norms) + def _log_terminal(self, bidx: int, epoch: int, stage: Stage): if bidx % self.print_freq == 0 and bidx > 0 or stage == VAL: # compute from last iteration diff --git a/src/weathergen/utils/plot_grad_norms.py b/src/weathergen/utils/plot_grad_norms.py index 8a6ded4ac..0ff1a1f5c 100644 --- a/src/weathergen/utils/plot_grad_norms.py +++ b/src/weathergen/utils/plot_grad_norms.py @@ -480,4 +480,7 @@ def analyze_gradient_file(json_file_path): return analyzer # Example usage: -analyzer = analyze_gradient_file('results/yvhxm2jc/yvhxm2jc_train_metrics.json') +# uv run python src/weathergen/utils/plot_grad_norms.py results/yvhxm2jc/yvhxm2jc_train_metrics.json +if __name__ == '__main__': + import sys + analyzer = analyze_gradient_file(sys.argv[1]) From 26c6869eccfc595173db9f11e94ad3f62b1ad210 Mon Sep 17 00:00:00 2001 From: sophiex <24638638+sophie-xhonneux@users.noreply.github.com> Date: Thu, 7 Aug 2025 10:49:05 +0000 Subject: [PATCH 04/19] Final fixes including backward compatibility --- config/default_config.yml | 2 +- src/weathergen/train/trainer.py | 13 +++++-------- src/weathergen/utils/plot_grad_norms.py | 14 +++++++------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/config/default_config.yml b/config/default_config.yml index 403b1c20d..9fa9d359e 100644 --- a/config/default_config.yml +++ b/config/default_config.yml @@ -105,7 +105,7 @@ grad_clip: 1.0 weight_decay: 0.1 norm_type: "LayerNorm" nn_module: "te" -log_grad_norms: True +log_grad_norms: False start_date: 197901010000 end_date: 202012310000 diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py index b8bf07ea4..9619c93d2 100644 --- a/src/weathergen/train/trainer.py +++ b/src/weathergen/train/trainer.py @@ -56,10 +56,6 
@@ def init( ): self.cf = cf - self.cf = OmegaConf.merge( - OmegaConf.create({"log_grad_norms": False}), self.cf - ) - assert cf.samples_per_epoch % cf.batch_size_per_gpu == 0 assert cf.samples_per_validation % cf.batch_size_validation_per_gpu == 0 assert cf.forecast_policy if cf.forecast_steps > 0 else True @@ -72,6 +68,8 @@ def init( # num_ranks gets overwritten by current setting during init_ddp() self.num_ranks_original = cf.get("num_ranks", None) + self.log_grad_norms = cf.get("log_grad_norms", False) + # TODO remove num_ranks, rank, with_with ddp from config self.init_ddp(cf) @@ -494,10 +492,9 @@ def train(self, epoch): ) # log gradient norms - if bidx % log_interval == 0: + if bidx % log_interval == 0 and self.log_grad_norms: self._log_instant_grad_norms(TRAIN, total_norm) - # optimizer step self.grad_scaler.step(self.optimizer) self.grad_scaler.update() @@ -712,12 +709,12 @@ def _log(self, stage: Stage): def _log_instant_grad_norms(self, stage: Stage, total_norm): """ - Log instantaneous grad norms, we do not average because of the cost and because we want to + Log instantaneous grad norms, we do not average because of the cost and because we want to measure the actual values TODO test DDP case """ - grad_norms = { "total_grad_norm" : total_norm.item() } + grad_norms = {"total_grad_norm": total_norm.item()} self.last_grad_norm = total_norm.item() for name, param in self.ddp_model.named_parameters(): if param.grad is not None: diff --git a/src/weathergen/utils/plot_grad_norms.py b/src/weathergen/utils/plot_grad_norms.py index 0ff1a1f5c..de50ad8f5 100644 --- a/src/weathergen/utils/plot_grad_norms.py +++ b/src/weathergen/utils/plot_grad_norms.py @@ -49,7 +49,7 @@ def create_dataframe(self): else: # Assume all keys except step/epoch are gradient data grad_data = {k: v for k, v in entry.items() - if 'stream' not in k and ('q_cells' in k or '0' in k)} + if 'stream' not in k and ('grad_norm' in k)} for param_name, norm_value in grad_data.items(): rows.append({ @@ 
-65,7 +65,7 @@ def create_dataframe(self): def extract_layer_type(self, param_name): """Extract layer type from parameter name.""" - param_name_lower = param_name.lower() + param_name_lower = param_name.lower()[10:] # Handle your specific naming patterns if param_name_lower.startswith('embeds.'): @@ -180,13 +180,13 @@ def extract_layer_depth(self, param_name): # Look for patterns specific to your architecture patterns = [ # embeds.0.layers.N.* (transformer layers within embeds) - r'embeds\.\d+\.layers\.(\d+)\.', + r'grad_norm_embeds\.\d+\.layers\.(\d+)\.', # embeds.0.unembed.N.* (unembedding layers) - r'embeds\.\d+\.unembed\.(\d+)\.', + r'grad_norm_embeds\.\d+\.unembed\.(\d+)\.', # embeds.0.ln_final.N.* (final layer norms) - r'embeds\.\d+\.ln_final\.(\d+)\.', + r'grad_norm_embeds\.\d+\.ln_final\.(\d+)\.', # ae_local_blocks.N.* (autoencoder local blocks) - r'ae_local_blocks\.(\d+)\.', + r'grad_norm_ae_local_blocks\.(\d+)\.', # ae_global_blocks.N.* (autoencoder global blocks) r'ae_global_blocks\.(\d+)\.', # ae_adapter.N.* (autoencoder adapter blocks) @@ -240,7 +240,7 @@ def plot_total_gradient_norms(self, figsize=(12, 6)): grad_data = entry['grad_norms'] else: grad_data = {k: v for k, v in entry.items() - if 'q_cells' in k or '0' in k} + if 'grad_norm' in k} if len(grad_data) == 0: continue From 9a66f7217d79a44700fa6d4280ae9b0f2eccc714 Mon Sep 17 00:00:00 2001 From: sophiex <24638638+sophie-xhonneux@users.noreply.github.com> Date: Thu, 7 Aug 2025 10:51:40 +0000 Subject: [PATCH 05/19] Ruff --- src/weathergen/train/trainer.py | 2 -- src/weathergen/utils/train_logger.py | 1 - 2 files changed, 3 deletions(-) diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py index 41a9aab68..b65987484 100644 --- a/src/weathergen/train/trainer.py +++ b/src/weathergen/train/trainer.py @@ -13,8 +13,6 @@ import time from typing import Any -from omegaconf import OmegaConf - import numpy as np import torch import tqdm diff --git 
a/src/weathergen/utils/train_logger.py b/src/weathergen/utils/train_logger.py index b6840df31..f60e748f7 100644 --- a/src/weathergen/utils/train_logger.py +++ b/src/weathergen/utils/train_logger.py @@ -149,7 +149,6 @@ def add_train( if perf_mem > 0.0: metrics[_performance_memory] = perf_mem - self.log_metrics("train", metrics) with open(self.path_run / (self.cf.run_id + "_perf_log.txt"), "ab") as f: np.savetxt(f, log_vals) From 22a6fd72d9903dfd9b804d463a93c9d06df782f8 Mon Sep 17 00:00:00 2001 From: sophiex <24638638+sophie-xhonneux@users.noreply.github.com> Date: Thu, 7 Aug 2025 12:01:38 +0000 Subject: [PATCH 06/19] More ruff stuff --- src/weathergen/utils/plot_grad_norms.py | 593 +++++++++++++----------- 1 file changed, 316 insertions(+), 277 deletions(-) diff --git a/src/weathergen/utils/plot_grad_norms.py b/src/weathergen/utils/plot_grad_norms.py index de50ad8f5..ec310c0fc 100644 --- a/src/weathergen/utils/plot_grad_norms.py +++ b/src/weathergen/utils/plot_grad_norms.py @@ -1,11 +1,13 @@ import json +import re +from pathlib import Path + import matplotlib.pyplot as plt import numpy as np import pandas as pd -from pathlib import Path -import seaborn as sns -from collections import defaultdict -import re + +# ruff: noqa: T201 + class GradientNormsAnalyzer: def __init__(self, json_file_path): @@ -17,193 +19,202 @@ def __init__(self, json_file_path): self.data = [] self.df = None self.load_data() - + def load_data(self): """Load and parse the JSON data from file.""" print(f"Loading data from {self.json_file_path}...") - - with open(self.json_file_path, 'r') as f: + + with open(self.json_file_path) as f: for line_num, line in enumerate(f, 1): try: data_point = json.loads(line.strip()) self.data.append(data_point) except json.JSONDecodeError as e: print(f"Warning: Could not parse line {line_num}: {e}") - + print(f"Loaded {len(self.data)} data points") self.create_dataframe() - + def create_dataframe(self): """Convert loaded data into a pandas DataFrame for easier 
analysis.""" rows = [] - + for ith, entry in enumerate(self.data): # step = entry.get('num_samples', entry.get('epoch', 0)) step = ith * 5 - + # Handle different possible data structures - if 'gradients' in entry: - grad_data = entry['gradients'] - elif 'grad_norms' in entry: - grad_data = entry['grad_norms'] + if "gradients" in entry: + grad_data = entry["gradients"] + elif "grad_norms" in entry: + grad_data = entry["grad_norms"] else: # Assume all keys except step/epoch are gradient data - grad_data = {k: v for k, v in entry.items() - if 'stream' not in k and ('grad_norm' in k)} - + grad_data = { + k: v for k, v in entry.items() if "stream" not in k and ("grad_norm" in k) + } + for param_name, norm_value in grad_data.items(): - rows.append({ - 'num_samples': step, - 'parameter': param_name, - 'grad_norm': float(norm_value), - 'layer_type': self.extract_layer_type(param_name), - 'layer_depth': self.extract_layer_depth(param_name) - }) - + rows.append( + { + "num_samples": step, + "parameter": param_name, + "grad_norm": float(norm_value), + "layer_type": self.extract_layer_type(param_name), + "layer_depth": self.extract_layer_depth(param_name), + } + ) + self.df = pd.DataFrame(rows) print(f"Created DataFrame with {len(self.df)} gradient norm records") - + def extract_layer_type(self, param_name): """Extract layer type from parameter name.""" param_name_lower = param_name.lower()[10:] - + # Handle your specific naming patterns - if param_name_lower.startswith('embeds.'): - if '.embed.' in param_name_lower: - return 'embedding' - elif '.unembed.' in param_name_lower: - return 'unembedding' - elif '.ln_final.' in param_name_lower: - return 'layer_norm_final' - elif 'proj_heads_q' in param_name_lower: - return 'attention_q' - elif 'proj_heads_k' in param_name_lower: - return 'attention_k' - elif 'proj_heads_v' in param_name_lower: - return 'attention_v' - elif 'proj_out' in param_name_lower: - return 'attention_out' - elif '.layers.' 
in param_name_lower and ('weight' in param_name_lower or 'bias' in param_name_lower): - return 'ffn' + if param_name_lower.startswith("embeds."): + if ".embed." in param_name_lower: + return "embedding" + elif ".unembed." in param_name_lower: + return "unembedding" + elif ".ln_final." in param_name_lower: + return "layer_norm_final" + elif "proj_heads_q" in param_name_lower: + return "attention_q" + elif "proj_heads_k" in param_name_lower: + return "attention_k" + elif "proj_heads_v" in param_name_lower: + return "attention_v" + elif "proj_out" in param_name_lower: + return "attention_out" + elif ".layers." in param_name_lower and ( + "weight" in param_name_lower or "bias" in param_name_lower + ): + return "ffn" else: - return 'embeds_other' - - elif param_name_lower.startswith('ae_local_blocks.'): - if 'proj_heads_q' in param_name_lower: - return 'ae_local_attention_q' - elif 'proj_heads_k' in param_name_lower: - return 'ae_local_attention_k' - elif 'proj_heads_v' in param_name_lower: - return 'ae_local_attention_v' - elif 'proj_out' in param_name_lower: - return 'ae_local_attention_out' - elif '.layers.' in param_name_lower: - return 'ae_local_ffn' + return "embeds_other" + + elif param_name_lower.startswith("ae_local_blocks."): + if "proj_heads_q" in param_name_lower: + return "ae_local_attention_q" + elif "proj_heads_k" in param_name_lower: + return "ae_local_attention_k" + elif "proj_heads_v" in param_name_lower: + return "ae_local_attention_v" + elif "proj_out" in param_name_lower: + return "ae_local_attention_out" + elif ".layers." 
in param_name_lower: + return "ae_local_ffn" else: - return 'ae_local_other' - - elif param_name_lower.startswith('ae_global_blocks.'): - if 'proj_heads_q' in param_name_lower: - return 'ae_global_attention_q' - elif 'proj_heads_k' in param_name_lower: - return 'ae_global_attention_k' - elif 'proj_heads_v' in param_name_lower: - return 'ae_global_attention_v' - elif 'proj_out' in param_name_lower: - return 'ae_global_attention_out' - elif '.layers.' in param_name_lower: - return 'ae_global_ffn' + return "ae_local_other" + + elif param_name_lower.startswith("ae_global_blocks."): + if "proj_heads_q" in param_name_lower: + return "ae_global_attention_q" + elif "proj_heads_k" in param_name_lower: + return "ae_global_attention_k" + elif "proj_heads_v" in param_name_lower: + return "ae_global_attention_v" + elif "proj_out" in param_name_lower: + return "ae_global_attention_out" + elif ".layers." in param_name_lower: + return "ae_global_ffn" else: - return 'ae_global_other' - - elif param_name_lower.startswith('ae_adapter.'): - if 'proj_heads_q' in param_name_lower: - return 'ae_adapter_attention_q' - elif 'proj_heads_k' in param_name_lower: - return 'ae_adapter_attention_k' - elif 'proj_heads_v' in param_name_lower: - return 'ae_adapter_attention_v' - elif 'proj_out' in param_name_lower: - return 'ae_adapter_attention_out' - elif '.layers.' in param_name_lower: - return 'ae_adapter_ffn' + return "ae_global_other" + + elif param_name_lower.startswith("ae_adapter."): + if "proj_heads_q" in param_name_lower: + return "ae_adapter_attention_q" + elif "proj_heads_k" in param_name_lower: + return "ae_adapter_attention_k" + elif "proj_heads_v" in param_name_lower: + return "ae_adapter_attention_v" + elif "proj_out" in param_name_lower: + return "ae_adapter_attention_out" + elif ".layers." 
in param_name_lower: + return "ae_adapter_ffn" else: - return 'ae_adapter_other' - - elif param_name_lower.startswith('target_token_engines.'): - if 'proj_heads_q' in param_name_lower: - return 'tte_attention_q' - elif 'proj_heads_k' in param_name_lower: - return 'tte_attention_k' - elif 'proj_heads_v' in param_name_lower: - return 'tte_attention_v' - elif 'proj_out' in param_name_lower: - return 'tte_attention_out' - elif 'embed_aux' in param_name_lower: - return 'tte_embed_aux' - elif 'lnorm' in param_name_lower: - return 'tte_layer_norm' - elif '.layers.' in param_name_lower: - return 'tte_ffn' + return "ae_adapter_other" + + elif param_name_lower.startswith("target_token_engines."): + if "proj_heads_q" in param_name_lower: + return "tte_attention_q" + elif "proj_heads_k" in param_name_lower: + return "tte_attention_k" + elif "proj_heads_v" in param_name_lower: + return "tte_attention_v" + elif "proj_out" in param_name_lower: + return "tte_attention_out" + elif "embed_aux" in param_name_lower: + return "tte_embed_aux" + elif "lnorm" in param_name_lower: + return "tte_layer_norm" + elif ".layers." 
in param_name_lower: + return "tte_ffn" else: - return 'tte_other' - - elif param_name_lower.startswith('embed_target_coords.'): - return 'target_coords_embedding' - - elif param_name_lower.startswith('pred_heads.'): - return 'prediction_head' - + return "tte_other" + + elif param_name_lower.startswith("embed_target_coords."): + return "target_coords_embedding" + + elif param_name_lower.startswith("pred_heads."): + return "prediction_head" + # Fallback for standard patterns (if any) - elif 'embed' in param_name_lower: - return 'embedding' - elif 'attention' in param_name_lower or 'attn' in param_name_lower: - if 'q_proj' in param_name_lower or 'query' in param_name_lower: - return 'attention_q' - elif 'k_proj' in param_name_lower or 'key' in param_name_lower: - return 'attention_k' - elif 'v_proj' in param_name_lower or 'value' in param_name_lower: - return 'attention_v' - elif 'o_proj' in param_name_lower or 'out' in param_name_lower: - return 'attention_out' + elif "embed" in param_name_lower: + return "embedding" + elif "attention" in param_name_lower or "attn" in param_name_lower: + if "q_proj" in param_name_lower or "query" in param_name_lower: + return "attention_q" + elif "k_proj" in param_name_lower or "key" in param_name_lower: + return "attention_k" + elif "v_proj" in param_name_lower or "value" in param_name_lower: + return "attention_v" + elif "o_proj" in param_name_lower or "out" in param_name_lower: + return "attention_out" else: - return 'attention' - elif 'layernorm' in param_name_lower or 'layer_norm' in param_name_lower or 'ln' in param_name_lower: - return 'layernorm' + return "attention" + elif ( + "layernorm" in param_name_lower + or "layer_norm" in param_name_lower + or "ln" in param_name_lower + ): + return "layernorm" else: - return 'other' - + return "other" + def extract_layer_depth(self, param_name): """Extract layer depth/index from parameter name.""" param_name_lower = param_name.lower() - + # Look for patterns specific to your 
architecture patterns = [ # embeds.0.layers.N.* (transformer layers within embeds) - r'grad_norm_embeds\.\d+\.layers\.(\d+)\.', + r"grad_norm_embeds\.\d+\.layers\.(\d+)\.", # embeds.0.unembed.N.* (unembedding layers) - r'grad_norm_embeds\.\d+\.unembed\.(\d+)\.', + r"grad_norm_embeds\.\d+\.unembed\.(\d+)\.", # embeds.0.ln_final.N.* (final layer norms) - r'grad_norm_embeds\.\d+\.ln_final\.(\d+)\.', + r"grad_norm_embeds\.\d+\.ln_final\.(\d+)\.", # ae_local_blocks.N.* (autoencoder local blocks) - r'grad_norm_ae_local_blocks\.(\d+)\.', + r"grad_norm_ae_local_blocks\.(\d+)\.", # ae_global_blocks.N.* (autoencoder global blocks) - r'ae_global_blocks\.(\d+)\.', + r"ae_global_blocks\.(\d+)\.", # ae_adapter.N.* (autoencoder adapter blocks) - r'ae_adapter\.(\d+)\.', + r"ae_adapter\.(\d+)\.", # target_token_engines.0.tte.N.* (target token engine blocks) - r'target_token_engines\.\d+\.tte\.(\d+)\.', + r"target_token_engines\.\d+\.tte\.(\d+)\.", # target_token_engines.0.tte.N.block.M.* (nested blocks) - r'target_token_engines\.\d+\.tte\.(\d+)\.block\.(\d+)\.', + r"target_token_engines\.\d+\.tte\.(\d+)\.block\.(\d+)\.", # pred_heads.0.pred_heads.0.N.* (prediction head layers) - r'pred_heads\.\d+\.pred_heads\.\d+\.(\d+)\.', + r"pred_heads\.\d+\.pred_heads\.\d+\.(\d+)\.", # Generic patterns for any numbered layers - r'layer[s]?\.(\d+)', - r'h\.(\d+)', - r'transformer\.(\d+)', - r'blocks\.(\d+)', + r"layer[s]?\.(\d+)", + r"h\.(\d+)", + r"transformer\.(\d+)", + r"blocks\.(\d+)", ] - + for pattern in patterns: match = re.search(pattern, param_name_lower) if match: @@ -213,274 +224,302 @@ def extract_layer_depth(self, param_name): return int(match.group(1)) * 10 + int(match.group(2)) else: return int(match.group(1)) - + # Special handling for components without clear depth - if param_name_lower.startswith('embed_target_coords.'): + if param_name_lower.startswith("embed_target_coords."): return 0 # Coordinate embeddings at the start - elif 'total_grad_norm' in param_name_lower: + elif 
"total_grad_norm" in param_name_lower: return -2 # Special marker for total norm - elif any(x in param_name_lower for x in ['weathergen', 'stage', 'q_cells']): + elif any(x in param_name_lower for x in ["weathergen", "stage", "q_cells"]): return -3 # Special marker for metadata - + return -1 # Unknown depth - + def plot_total_gradient_norms(self, figsize=(12, 6)): """Plot total gradient norm over training steps.""" # Calculate total norm per step total_norms = [] steps = [] - + for ith, entry in enumerate(self.data): # step = entry.get('num_samples', entry.get('epoch', 0)) step = ith * 5 - - if 'gradients' in entry: - grad_data = entry['gradients'] - elif 'grad_norms' in entry: - grad_data = entry['grad_norms'] + + if "gradients" in entry: + grad_data = entry["gradients"] + elif "grad_norms" in entry: + grad_data = entry["grad_norms"] else: - grad_data = {k: v for k, v in entry.items() - if 'grad_norm' in k} + grad_data = {k: v for k, v in entry.items() if "grad_norm" in k} if len(grad_data) == 0: continue - + # Calculate total norm (L2 norm of all gradients) - total_norm = np.sqrt(sum(float(v)**2 for v in grad_data.values())) + total_norm = np.sqrt(sum(float(v) ** 2 for v in grad_data.values())) total_norms.append(total_norm) steps.append(step) - + plt.figure(figsize=figsize) plt.plot(steps, total_norms, linewidth=1.5, alpha=0.8) - plt.xlabel('Training Step') - plt.ylabel('Total Gradient Norm') - plt.title('Total Gradient Norm vs Training Steps') - plt.yscale('log') + plt.xlabel("Training Step") + plt.ylabel("Total Gradient Norm") + plt.title("Total Gradient Norm vs Training Steps") + plt.yscale("log") plt.grid(True, alpha=0.3) plt.tight_layout() plt.savefig("plots/total_grad_norm.png") - + return steps, total_norms - + def plot_layer_type_norms(self, figsize=(14, 8)): """Plot gradient norms grouped by layer type.""" if self.df is None: print("No DataFrame available. 
Load data first.") return - + plt.figure(figsize=figsize) - + # Get unique layer types - layer_types = self.df['layer_type'].unique() + layer_types = self.df["layer_type"].unique() print(layer_types) colors = plt.cm.tab10(np.linspace(0, 1, len(layer_types))) - + for i, layer_type in enumerate(layer_types): - layer_data = self.df[self.df['layer_type'] == layer_type] - + layer_data = self.df[self.df["layer_type"] == layer_type] + # Calculate mean gradient norm per step for this layer type - mean_norms = layer_data.groupby('num_samples')['grad_norm'].mean() - - plt.plot(mean_norms.index, mean_norms.values, - label=layer_type, color=colors[i], alpha=0.8) - - plt.xlabel('Training Step') - plt.ylabel('Mean Gradient Norm') - plt.title('Gradient Norms by Layer Type') - plt.yscale('log') - plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + mean_norms = layer_data.groupby("num_samples")["grad_norm"].mean() + + plt.plot( + mean_norms.index, mean_norms.values, label=layer_type, color=colors[i], alpha=0.8 + ) + + plt.xlabel("Training Step") + plt.ylabel("Mean Gradient Norm") + plt.title("Gradient Norms by Layer Type") + plt.yscale("log") + plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left") plt.grid(True, alpha=0.3) plt.tight_layout() plt.savefig("plots/grad_norm_by_layer_type.png") - + def plot_layer_depth_analysis(self, figsize=(12, 8)): """Plot gradient norms by layer depth.""" if self.df is None: print("No DataFrame available. 
Load data first.") return - + # Filter out unknown depths - depth_data = self.df[self.df['layer_depth'] >= 0] - + depth_data = self.df[self.df["layer_depth"] >= 0] + if len(depth_data) == 0: print("No layer depth information found in parameter names.") return - + fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize) - + # Plot 1: Mean gradient norm by depth over time - depths = sorted(depth_data['layer_depth'].unique()) + depths = sorted(depth_data["layer_depth"].unique()) colors = plt.cm.viridis(np.linspace(0, 1, len(depths))) - + for i, depth in enumerate(depths): - layer_data = depth_data[depth_data['layer_depth'] == depth] - mean_norms = layer_data.groupby('num_samples')['grad_norm'].mean() - - ax1.plot(mean_norms.index, mean_norms.values, - label=f'Layer {depth}', color=colors[i], alpha=0.8) - - ax1.set_xlabel('Training Step') - ax1.set_ylabel('Mean Gradient Norm') - ax1.set_title('Gradient Norms by Layer Depth') - ax1.set_yscale('log') - ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left') + layer_data = depth_data[depth_data["layer_depth"] == depth] + mean_norms = layer_data.groupby("num_samples")["grad_norm"].mean() + + ax1.plot( + mean_norms.index, + mean_norms.values, + label=f"Layer {depth}", + color=colors[i], + alpha=0.8, + ) + + ax1.set_xlabel("Training Step") + ax1.set_ylabel("Mean Gradient Norm") + ax1.set_title("Gradient Norms by Layer Depth") + ax1.set_yscale("log") + ax1.legend(bbox_to_anchor=(1.05, 1), loc="upper left") ax1.grid(True, alpha=0.3) - + # Plot 2: Heatmap of gradient norms by depth and step - pivot_data = depth_data.groupby(['num_samples', 'layer_depth'])['grad_norm'].mean().unstack() - + pivot_data = ( + depth_data.groupby(["num_samples", "layer_depth"])["grad_norm"].mean().unstack() + ) + # Sample data if too many steps for readability if len(pivot_data) > 100: - sample_idx = np.linspace(0, len(pivot_data)-1, 100, dtype=int) + sample_idx = np.linspace(0, len(pivot_data) - 1, 100, dtype=int) pivot_data = 
pivot_data.iloc[sample_idx] - - im = ax2.imshow(pivot_data.T, aspect='auto', cmap='viridis', - extent=[pivot_data.index.min(), pivot_data.index.max(), - pivot_data.columns.min(), pivot_data.columns.max()]) - ax2.set_xlabel('Training Step') - ax2.set_ylabel('Layer Depth') - ax2.set_title('Gradient Norm Heatmap (Layer Depth vs Step)') - + + im = ax2.imshow( + pivot_data.T, + aspect="auto", + cmap="viridis", + extent=[ + pivot_data.index.min(), + pivot_data.index.max(), + pivot_data.columns.min(), + pivot_data.columns.max(), + ], + ) + ax2.set_xlabel("Training Step") + ax2.set_ylabel("Layer Depth") + ax2.set_title("Gradient Norm Heatmap (Layer Depth vs Step)") + cbar = plt.colorbar(im, ax=ax2) - cbar.set_label('Gradient Norm') - + cbar.set_label("Gradient Norm") + plt.tight_layout() plt.savefig("plots/grad_norm_heatmap.png") - + def plot_gradient_distribution(self, figsize=(15, 10)): """Plot distribution of gradient norms.""" if self.df is None: print("No DataFrame available. Load data first.") return - + fig, axes = plt.subplots(2, 2, figsize=figsize) - + # Plot 1: Histogram of all gradient norms - axes[0, 0].hist(np.log10(self.df['grad_norm'].values), bins=50, alpha=0.7) - axes[0, 0].set_xlabel('Log10(Gradient Norm)') - axes[0, 0].set_ylabel('Frequency') - axes[0, 0].set_title('Distribution of Gradient Norms (Log Scale)') + axes[0, 0].hist(np.log10(self.df["grad_norm"].values), bins=50, alpha=0.7) + axes[0, 0].set_xlabel("Log10(Gradient Norm)") + axes[0, 0].set_ylabel("Frequency") + axes[0, 0].set_title("Distribution of Gradient Norms (Log Scale)") axes[0, 0].grid(True, alpha=0.3) - + # Plot 2: Box plot by layer type - layer_types = self.df['layer_type'].unique()[:10] # Limit to 10 for readability - plot_data = [np.log10(self.df[self.df['layer_type'] == lt]['grad_norm'].values) - for lt in layer_types] - + layer_types = self.df["layer_type"].unique()[:10] # Limit to 10 for readability + plot_data = [ + np.log10(self.df[self.df["layer_type"] == 
lt]["grad_norm"].values) for lt in layer_types + ] + axes[0, 1].boxplot(plot_data, labels=layer_types) - axes[0, 1].set_xlabel('Layer Type') - axes[0, 1].set_ylabel('Log10(Gradient Norm)') - axes[0, 1].set_title('Gradient Norm Distribution by Layer Type') - axes[0, 1].tick_params(axis='x', rotation=45) + axes[0, 1].set_xlabel("Layer Type") + axes[0, 1].set_ylabel("Log10(Gradient Norm)") + axes[0, 1].set_title("Gradient Norm Distribution by Layer Type") + axes[0, 1].tick_params(axis="x", rotation=45) axes[0, 1].grid(True, alpha=0.3) - + # Plot 3: Gradient norms over time (sample of parameters) - sample_params = self.df['parameter'].unique()[:20] # Sample 20 parameters + sample_params = self.df["parameter"].unique()[:20] # Sample 20 parameters for param in sample_params: - param_data = self.df[self.df['parameter'] == param] - axes[1, 0].plot(param_data['num_samples'], param_data['grad_norm'], - alpha=0.6, linewidth=0.8) - - axes[1, 0].set_xlabel('Training Step') - axes[1, 0].set_ylabel('Gradient Norm') - axes[1, 0].set_title('Individual Parameter Gradient Norms (Sample)') - axes[1, 0].set_yscale('log') + param_data = self.df[self.df["parameter"] == param] + axes[1, 0].plot( + param_data["num_samples"], param_data["grad_norm"], alpha=0.6, linewidth=0.8 + ) + + axes[1, 0].set_xlabel("Training Step") + axes[1, 0].set_ylabel("Gradient Norm") + axes[1, 0].set_title("Individual Parameter Gradient Norms (Sample)") + axes[1, 0].set_yscale("log") axes[1, 0].grid(True, alpha=0.3) - + # Plot 4: Statistics over time - stats_by_step = self.df.groupby('num_samples')['grad_norm'].agg(['mean', 'std', 'min', 'max']) - - axes[1, 1].fill_between(stats_by_step.index, - stats_by_step['mean'] - stats_by_step['std'], - stats_by_step['mean'] + stats_by_step['std'], - alpha=0.3, label='±1 std') - axes[1, 1].plot(stats_by_step.index, stats_by_step['mean'], - label='Mean', linewidth=2) - axes[1, 1].plot(stats_by_step.index, stats_by_step['max'], - label='Max', linewidth=1, alpha=0.8) - axes[1, 
1].plot(stats_by_step.index, stats_by_step['min'], - label='Min', linewidth=1, alpha=0.8) - - axes[1, 1].set_xlabel('Training Step') - axes[1, 1].set_ylabel('Gradient Norm') - axes[1, 1].set_title('Gradient Norm Statistics Over Time') - axes[1, 1].set_yscale('log') + stats_by_step = self.df.groupby("num_samples")["grad_norm"].agg( + ["mean", "std", "min", "max"] + ) + + axes[1, 1].fill_between( + stats_by_step.index, + stats_by_step["mean"] - stats_by_step["std"], + stats_by_step["mean"] + stats_by_step["std"], + alpha=0.3, + label="±1 std", + ) + axes[1, 1].plot(stats_by_step.index, stats_by_step["mean"], label="Mean", linewidth=2) + axes[1, 1].plot( + stats_by_step.index, stats_by_step["max"], label="Max", linewidth=1, alpha=0.8 + ) + axes[1, 1].plot( + stats_by_step.index, stats_by_step["min"], label="Min", linewidth=1, alpha=0.8 + ) + + axes[1, 1].set_xlabel("Training Step") + axes[1, 1].set_ylabel("Gradient Norm") + axes[1, 1].set_title("Gradient Norm Statistics Over Time") + axes[1, 1].set_yscale("log") axes[1, 1].legend() axes[1, 1].grid(True, alpha=0.3) - + plt.tight_layout() plt.savefig("plots/grad_norm_over_time.png") - + def generate_summary_report(self): """Generate a summary report of gradient norm statistics.""" if self.df is None: print("No DataFrame available. 
Load data first.") return - + print("=== GRADIENT NORMS ANALYSIS REPORT ===") print(f"Total data points: {len(self.df)}") print(f"Training steps: {self.df['num_samples'].nunique()}") print(f"Unique parameters: {self.df['parameter'].nunique()}") print() - + print("Overall Statistics:") print(f"Mean gradient norm: {self.df['grad_norm'].mean():.6f}") print(f"Median gradient norm: {self.df['grad_norm'].median():.6f}") print(f"Min gradient norm: {self.df['grad_norm'].min():.6f}") print(f"Max gradient norm: {self.df['grad_norm'].max():.6f}") print() - + print("Statistics by Layer Type:") - layer_stats = self.df.groupby('layer_type')['grad_norm'].agg(['count', 'mean', 'std', 'min', 'max']) + layer_stats = self.df.groupby("layer_type")["grad_norm"].agg( + ["count", "mean", "std", "min", "max"] + ) print(layer_stats) print() - + # Check for potential issues print("Potential Issues:") - very_small = (self.df['grad_norm'] < 1e-6).sum() - very_large = (self.df['grad_norm'] > 10.0).sum() - + very_small = (self.df["grad_norm"] < 1e-6).sum() + very_large = (self.df["grad_norm"] > 10.0).sum() + if very_small > 0: print(f"⚠️ {very_small} gradient norms < 1e-6 (possible vanishing gradients)") if very_large > 0: print(f"⚠️ {very_large} gradient norms > 10.0 (possible exploding gradients)") - + if very_small == 0 and very_large == 0: print("✅ No obvious gradient issues detected") + # Usage example def analyze_gradient_file(json_file_path): """ Main function to analyze gradient norms from a JSON file. - + Usage: analyze_gradient_file('gradient_norms.jsonl') """ - + analyzer = GradientNormsAnalyzer(json_file_path) - + # Generate summary report analyzer.generate_summary_report() - + # Create all plots print("\n=== GENERATING PLOTS ===") - + print("1. Total gradient norms over time...") analyzer.plot_total_gradient_norms() - + print("2. Gradient norms by layer type...") analyzer.plot_layer_type_norms() - + print("3. 
Layer depth analysis...") analyzer.plot_layer_depth_analysis() - + print("4. Gradient distribution analysis...") analyzer.plot_gradient_distribution() - + return analyzer + # Example usage: # uv run python src/weathergen/utils/plot_grad_norms.py results/yvhxm2jc/yvhxm2jc_train_metrics.json -if __name__ == '__main__': +if __name__ == "__main__": import sys + analyzer = analyze_gradient_file(sys.argv[1]) From 754d31c660d2fb6f40e285b59f5630971c519d73 Mon Sep 17 00:00:00 2001 From: Julian Kuehnert Date: Wed, 8 Oct 2025 14:22:12 +0000 Subject: [PATCH 07/19] forecast config with small decoder --- config/default_config.yml | 29 +++++++++++++++-------------- config/streams/era5_1deg/era5.yml | 2 +- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/config/default_config.yml b/config/default_config.yml index 2ecf4f6b8..dde6fafbc 100644 --- a/config/default_config.yml +++ b/config/default_config.yml @@ -10,7 +10,7 @@ embed_dropout_rate: 0.1 target_cell_local_prediction: True ae_local_dim_embed: 1024 -ae_local_num_blocks: 2 +ae_local_num_blocks: 0 ae_local_num_heads: 16 ae_local_dropout_rate: 0.1 ae_local_with_qk_lnorm: True @@ -24,7 +24,7 @@ ae_adapter_with_residual: True ae_adapter_dropout_rate: 0.1 ae_global_dim_embed: 2048 -ae_global_num_blocks: 8 +ae_global_num_blocks: 4 ae_global_num_heads: 32 ae_global_dropout_rate: 0.1 ae_global_with_qk_lnorm: True @@ -34,18 +34,19 @@ ae_global_mlp_hidden_factor: 2 decoder_type: PerceiverIOCoordConditioning # CrossAttentionAdaNormConditioning pred_adapter_kv: False -pred_self_attention: True +pred_self_attention: False pred_dyadic_dims: False pred_mlp_adaln: True # number of steps offset applied to first target window; if set to zero and forecast_steps=0 then # one is training an auto-encoder -forecast_offset : 0 +forecast_offset : 1 forecast_delta_hrs: 0 -forecast_steps: 0 -forecast_policy: null +forecast_steps: 2 +forecast_policy: "fixed" +forecast_freeze_model: False forecast_att_dense_rate: 1.0 -fe_num_blocks: 0 
+fe_num_blocks: 8 fe_num_heads: 16 fe_dropout_rate: 0.1 fe_with_qk_lnorm: True @@ -85,7 +86,7 @@ freeze_modules: "" # training mode: "forecast" or "masking" (masked token modeling) # for "masking" to train with auto-encoder mode, forecast_offset should be 0 -training_mode: "masking" +training_mode: "forecast" # masking rate when training mode is "masking"; ignored in foreacast mode masking_rate: 0.6 # sample the masking rate (with normal distribution centered at masking_rate) @@ -93,7 +94,7 @@ masking_rate: 0.6 masking_rate_sampling: True # sample a subset of all target points, useful e.g. to reduce memory requirements (also can specify per-stream) sampling_rate_target: 1.0 -# include a masking strategy here, currently only supporting "random", "block", "healpix", "channel", "causal" and "combination" +# include a masking strategy here, currently only supporting "random", "block", "healpix", "channel", "combination" masking_strategy: "random" # masking_strategy_config is a dictionary of additional parameters for the masking strategy # required for "healpix" and "channel" masking strategies @@ -105,17 +106,17 @@ masking_strategy_config: {"strategies": ["random", "healpix", "channel"], "same_strategy_per_batch": false } -num_epochs: 32 +num_epochs: 64 samples_per_epoch: 4096 samples_per_validation: 512 shuffle: True lr_scaling_policy: "sqrt" lr_start: 1e-6 -lr_max: 5e-5 -lr_final_decay: 1e-6 +lr_max: 0.0001 +lr_final_decay: 2e-6 lr_final: 0.0 -lr_steps_warmup: 512 +lr_steps_warmup: 256 lr_steps_cooldown: 512 lr_policy_warmup: "cosine" lr_policy_decay: "linear" @@ -151,4 +152,4 @@ run_id: ??? 
# Parameters for logging/printing in the training loop train_log: # The period to log metrics (in number of batch steps) - log_interval: 20 + log_interval: 20 \ No newline at end of file diff --git a/config/streams/era5_1deg/era5.yml b/config/streams/era5_1deg/era5.yml index a03bb3b40..aaf1bbf53 100644 --- a/config/streams/era5_1deg/era5.yml +++ b/config/streams/era5_1deg/era5.yml @@ -29,7 +29,7 @@ ERA5 : dim_embed : 256 target_readout : type : 'obs_value' # token or obs_value - num_layers : 2 + num_layers : 1 num_heads : 4 # sampling_rate : 0.2 pred_head : From 7c756a3544c91e4f59962f8e1ae6290cf20a45ba Mon Sep 17 00:00:00 2001 From: Julian Kuehnert Date: Thu, 9 Oct 2025 08:38:14 +0000 Subject: [PATCH 08/19] fixed uv.lock --- uv.lock | 292 +++++++++----------------------------------------------- 1 file changed, 44 insertions(+), 248 deletions(-) diff --git a/uv.lock b/uv.lock index 56e875859..79a5b2e2f 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = "==3.12.*" resolution-markers = [ "platform_machine == 'aarch64' and sys_platform == 'linux'", @@ -874,7 +874,7 @@ name = "jinja2" version = "3.1.6" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "markupsafe", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, + { name = "markupsafe", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = 
"sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } wheels = [ @@ -1251,52 +1251,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c2/1c/6d343e030815c7c97a1f9fbad00211b47717c7fe446834c224bd5311e6f1/numpy-2.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:bd8df082b6c4695753ad6193018c05aac465d634834dca47a3ae06d4bb22d9ea", size = 9891498, upload-time = "2025-06-07T14:43:36.332Z" }, ] -[[package]] -name = "nvidia-cublas-cu12" -version = "12.4.5.8" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771, upload-time = "2024-06-18T19:28:09.881Z" }, - { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805, upload-time = "2024-04-03T20:57:06.025Z" }, - { url = "https://files.pythonhosted.org/packages/e2/2a/4f27ca96232e8b5269074a72e03b4e0d43aa68c9b965058b1684d07c6ff8/nvidia_cublas_cu12-12.4.5.8-py3-none-win_amd64.whl", hash = "sha256:5a796786da89203a0657eda402bcdcec6180254a8ac22d72213abc42069522dc", size = 396895858, upload-time = "2024-04-03T21:03:31.996Z" }, -] - [[package]] name = "nvidia-cublas-cu12" version = "12.6.4.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] wheels = [ { url = 
"https://files.pythonhosted.org/packages/af/eb/ff4b8c503fa1f1796679dce648854d58751982426e4e4b37d6fce49d259c/nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:08ed2686e9875d01b58e3cb379c6896df8e76c75e0d4a7f7dace3d7b6d9ef8eb", size = 393138322, upload-time = "2024-11-20T17:40:25.65Z" }, { url = "https://files.pythonhosted.org/packages/97/0d/f1f0cadbf69d5b9ef2e4f744c9466cb0a850741d08350736dfdb4aa89569/nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:235f728d6e2a409eddf1df58d5b0921cf80cfa9e72b9f2775ccb7b4a87984668", size = 390794615, upload-time = "2024-11-20T17:39:52.715Z" }, { url = "https://files.pythonhosted.org/packages/84/f7/985e9bdbe3e0ac9298fcc8cfa51a392862a46a0ffaccbbd56939b62a9c83/nvidia_cublas_cu12-12.6.4.1-py3-none-win_amd64.whl", hash = "sha256:9e4fa264f4d8a4eb0cdbd34beadc029f453b3bafae02401e999cf3d5a5af75f8", size = 434535301, upload-time = "2024-11-20T17:50:41.681Z" }, ] -[[package]] -name = "nvidia-cuda-cupti-cu12" -version = "12.4.127" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556, upload-time = "2024-06-18T19:30:40.546Z" }, - { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957, upload-time = "2024-04-03T20:55:01.564Z" }, - { url = 
"https://files.pythonhosted.org/packages/f3/79/8cf313ec17c58ccebc965568e5bcb265cdab0a1df99c4e674bb7a3b99bfe/nvidia_cuda_cupti_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:5688d203301ab051449a2b1cb6690fbe90d2b372f411521c86018b950f3d7922", size = 9938035, upload-time = "2024-04-03T21:01:01.109Z" }, -] - [[package]] name = "nvidia-cuda-cupti-cu12" version = "12.6.80" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] wheels = [ { url = "https://files.pythonhosted.org/packages/e6/8b/2f6230cb715646c3a9425636e513227ce5c93c4d65823a734f4bb86d43c3/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:166ee35a3ff1587f2490364f90eeeb8da06cd867bd5b701bf7f9a02b78bc63fc", size = 8236764, upload-time = "2024-11-20T17:35:41.03Z" }, { url = "https://files.pythonhosted.org/packages/25/0f/acb326ac8fd26e13c799e0b4f3b2751543e1834f04d62e729485872198d4/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_aarch64.whl", hash = "sha256:358b4a1d35370353d52e12f0a7d1769fc01ff74a191689d3870b2123156184c4", size = 8236756, upload-time = "2024-10-01T16:57:45.507Z" }, @@ -1305,52 +1273,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1c/81/7796f096afaf726796b1b648f3bc80cafc61fe7f77f44a483c89e6c5ef34/nvidia_cuda_cupti_cu12-12.6.80-py3-none-win_amd64.whl", hash = "sha256:bbe6ae76e83ce5251b56e8c8e61a964f757175682bbad058b170b136266ab00a", size = 5724175, upload-time = "2024-10-01T17:09:47.955Z" }, ] -[[package]] -name = "nvidia-cuda-nvrtc-cu12" -version = "12.4.127" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = 
"sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372, upload-time = "2024-06-18T19:32:00.576Z" }, - { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306, upload-time = "2024-04-03T20:56:01.463Z" }, - { url = "https://files.pythonhosted.org/packages/7c/30/8c844bfb770f045bcd8b2c83455c5afb45983e1a8abf0c4e5297b481b6a5/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:a961b2f1d5f17b14867c619ceb99ef6fcec12e46612711bcec78eb05068a60ec", size = 19751955, upload-time = "2024-04-03T21:01:51.133Z" }, -] - [[package]] name = "nvidia-cuda-nvrtc-cu12" version = "12.6.77" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] wheels = [ { url = "https://files.pythonhosted.org/packages/f4/2f/72df534873235983cc0a5371c3661bebef7c4682760c275590b972c7b0f9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5847f1d6e5b757f1d2b3991a01082a44aad6f10ab3c5c0213fa3e25bddc25a13", size = 23162955, upload-time = "2024-10-01T16:59:50.922Z" }, { url = "https://files.pythonhosted.org/packages/75/2e/46030320b5a80661e88039f59060d1790298b4718944a65a7f2aeda3d9e9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:35b0cc6ee3a9636d5409133e79273ce1f3fd087abb0532d2d2e8fff1fe9efc53", size = 23650380, upload-time = "2024-10-01T17:00:14.643Z" }, { url = "https://files.pythonhosted.org/packages/f5/46/d3a1cdda8bb113c80f43a0a6f3a853356d487b830f3483f92d49ce87fa55/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:f7007dbd914c56bd80ea31bc43e8e149da38f68158f423ba845fc3292684e45a", size = 39026742, upload-time = "2024-10-01T17:10:49.058Z" }, ] -[[package]] -name = 
"nvidia-cuda-runtime-cu12" -version = "12.4.127" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177, upload-time = "2024-06-18T19:32:52.877Z" }, - { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737, upload-time = "2024-04-03T20:54:51.355Z" }, - { url = "https://files.pythonhosted.org/packages/a8/8b/450e93fab75d85a69b50ea2d5fdd4ff44541e0138db16f9cd90123ef4de4/nvidia_cuda_runtime_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:09c2e35f48359752dfa822c09918211844a3d93c100a715d79b59591130c5e1e", size = 878808, upload-time = "2024-04-03T21:00:49.77Z" }, -] - [[package]] name = "nvidia-cuda-runtime-cu12" version = "12.6.77" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] wheels = [ { url = "https://files.pythonhosted.org/packages/8f/ea/590b2ac00d772a8abd1c387a92b46486d2679ca6622fd25c18ff76265663/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6116fad3e049e04791c0256a9778c16237837c08b27ed8c8401e2e45de8d60cd", size = 908052, upload-time = "2024-11-20T17:35:19.905Z" }, { url = "https://files.pythonhosted.org/packages/b7/3d/159023799677126e20c8fd580cca09eeb28d5c5a624adc7f793b9aa8bbfa/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = 
"sha256:d461264ecb429c84c8879a7153499ddc7b19b5f8d84c204307491989a365588e", size = 908040, upload-time = "2024-10-01T16:57:22.221Z" }, @@ -1359,30 +1295,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/76/4c80fa138333cc975743fd0687a745fccb30d167f906f13c1c7f9a85e5ea/nvidia_cuda_runtime_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:86c58044c824bf3c173c49a2dbc7a6c8b53cb4e4dca50068be0bf64e9dab3f7f", size = 891773, upload-time = "2024-10-01T17:09:26.362Z" }, ] -[[package]] -name = "nvidia-cudnn-cu12" -version = "9.1.0.70" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] -dependencies = [ - { name = "nvidia-cublas-cu12", version = "12.4.5.8", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741, upload-time = "2024-04-22T15:24:15.253Z" }, - { url = "https://files.pythonhosted.org/packages/3f/d0/f90ee6956a628f9f04bf467932c0a25e5a7e706a684b896593c06c82f460/nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a", size = 679925892, upload-time = "2024-04-22T15:24:53.333Z" }, -] - [[package]] name = "nvidia-cudnn-cu12" version = "9.5.1.17" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] dependencies = [ - { name = "nvidia-cublas-cu12", version = "12.6.4.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 
'x86_64' and sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/99/93/a201a12d3ec1caa8c6ac34c1c2f9eeb696b886f0c36ff23c638b46603bd0/nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9fd4584468533c61873e5fda8ca41bac3a38bcb2d12350830c69b0a96a7e4def", size = 570523509, upload-time = "2024-10-25T19:53:03.148Z" }, @@ -1390,31 +1308,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b6/b2/3f60d15f037fa5419d9d7f788b100ef33ea913ae5315c87ca6d6fa606c35/nvidia_cudnn_cu12-9.5.1.17-py3-none-win_amd64.whl", hash = "sha256:d7af0f8a4f3b4b9dbb3122f2ef553b45694ed9c384d5a75bab197b8eefb79ab8", size = 565440743, upload-time = "2024-10-25T19:55:49.74Z" }, ] -[[package]] -name = "nvidia-cufft-cu12" -version = "11.2.1.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] -dependencies = [ - { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548, upload-time = "2024-06-18T19:33:39.396Z" }, - { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117, upload-time = "2024-04-03T20:57:40.402Z" }, - { url = "https://files.pythonhosted.org/packages/f6/ee/3f3f8e9874f0be5bbba8fb4b62b3de050156d159f8b6edc42d6f1074113b/nvidia_cufft_cu12-11.2.1.3-py3-none-win_amd64.whl", hash = 
"sha256:d802f4954291101186078ccbe22fc285a902136f974d369540fd4a5333d1440b", size = 210576476, upload-time = "2024-04-03T21:04:06.422Z" }, -] - [[package]] name = "nvidia-cufft-cu12" version = "11.3.0.4" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] dependencies = [ - { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/1f/37/c50d2b2f2c07e146776389e3080f4faf70bcc4fa6e19d65bb54ca174ebc3/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d16079550df460376455cba121db6564089176d9bac9e4f360493ca4741b22a6", size = 200164144, upload-time = "2024-11-20T17:40:58.288Z" }, @@ -1424,26 +1323,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b4/38/36fd800cec8f6e89b7c1576edaaf8076e69ec631644cdbc1b5f2e2b5a9df/nvidia_cufft_cu12-11.3.0.4-py3-none-win_amd64.whl", hash = "sha256:6048ebddfb90d09d2707efb1fd78d4e3a77cb3ae4dc60e19aab6be0ece2ae464", size = 199356881, upload-time = "2024-10-01T17:13:01.861Z" }, ] -[[package]] -name = "nvidia-curand-cu12" -version = "10.3.5.147" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811, upload-time = "2024-06-18T19:34:48.575Z" }, - { url = 
"https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206, upload-time = "2024-04-03T20:58:08.722Z" }, - { url = "https://files.pythonhosted.org/packages/1c/22/2573503d0d4e45673c263a313f79410e110eb562636b0617856fdb2ff5f6/nvidia_curand_cu12-10.3.5.147-py3-none-win_amd64.whl", hash = "sha256:f307cc191f96efe9e8f05a87096abc20d08845a841889ef78cb06924437f6771", size = 55799918, upload-time = "2024-04-03T21:04:34.45Z" }, -] - [[package]] name = "nvidia-curand-cu12" version = "10.3.7.77" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] wheels = [ { url = "https://files.pythonhosted.org/packages/42/ac/36543605358a355632f1a6faa3e2d5dfb91eab1e4bc7d552040e0383c335/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:6e82df077060ea28e37f48a3ec442a8f47690c7499bff392a5938614b56c98d8", size = 56289881, upload-time = "2024-10-01T17:04:18.981Z" }, { url = "https://files.pythonhosted.org/packages/73/1b/44a01c4e70933637c93e6e1a8063d1e998b50213a6b65ac5a9169c47e98e/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a42cd1344297f70b9e39a1e4f467a4e1c10f1da54ff7a85c12197f6c652c8bdf", size = 56279010, upload-time = "2024-11-20T17:42:50.958Z" }, @@ -1452,35 +1335,14 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a9/a8/0cd0cec757bd4b4b4ef150fca62ec064db7d08a291dced835a0be7d2c147/nvidia_curand_cu12-10.3.7.77-py3-none-win_amd64.whl", hash = "sha256:6d6d935ffba0f3d439b7cd968192ff068fafd9018dbf1b85b37261b13cfc9905", size = 55783873, upload-time = "2024-10-01T17:13:30.377Z" }, ] -[[package]] -name = "nvidia-cusolver-cu12" -version = "11.6.1.9" -source = { registry = "https://pypi.org/simple" } 
-resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] -dependencies = [ - { name = "nvidia-cublas-cu12", version = "12.4.5.8", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", version = "12.3.1.170", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111, upload-time = "2024-06-18T19:35:01.793Z" }, - { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057, upload-time = "2024-04-03T20:58:28.735Z" }, - { url = "https://files.pythonhosted.org/packages/f2/be/d435b7b020e854d5d5a682eb5de4328fd62f6182507406f2818280e206e2/nvidia_cusolver_cu12-11.6.1.9-py3-none-win_amd64.whl", hash = "sha256:e77314c9d7b694fcebc84f58989f3aa4fb4cb442f12ca1a9bde50f5e8f6d1b9c", size = 125224015, upload-time = "2024-04-03T21:04:53.339Z" }, -] - [[package]] name = "nvidia-cusolver-cu12" version = "11.7.1.2" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] dependencies = [ - { name = "nvidia-cublas-cu12", version = "12.6.4.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 
'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", version = "12.5.4.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/93/17/dbe1aa865e4fdc7b6d4d0dd308fdd5aaab60f939abfc0ea1954eac4fb113/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0ce237ef60acde1efc457335a2ddadfd7610b892d94efee7b776c64bb1cac9e0", size = 157833628, upload-time = "2024-10-01T17:05:05.591Z" }, @@ -1490,31 +1352,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d4/53/fff50a0808df7113d77e3bbc7c2b7eaed6f57d5eb80fbe93ead2aea1e09a/nvidia_cusolver_cu12-11.7.1.2-py3-none-win_amd64.whl", hash = "sha256:6813f9d8073f555444a8705f3ab0296d3e1cb37a16d694c5fc8b862a0d8706d7", size = 149287877, upload-time = "2024-10-01T17:13:49.804Z" }, ] -[[package]] -name = "nvidia-cusparse-cu12" -version = "12.3.1.170" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] -dependencies = [ - { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, -] -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987, upload-time = "2024-06-18T19:35:32.989Z" }, - { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763, upload-time = "2024-04-03T20:58:59.995Z" }, - { url = "https://files.pythonhosted.org/packages/a2/e0/3155ca539760a8118ec94cc279b34293309bcd14011fc724f87f31988843/nvidia_cusparse_cu12-12.3.1.170-py3-none-win_amd64.whl", hash = "sha256:9bc90fb087bc7b4c15641521f31c0371e9a612fc2ba12c338d3ae032e6b6797f", size = 204684315, upload-time = "2024-04-03T21:05:26.031Z" }, -] - [[package]] name = "nvidia-cusparse-cu12" version = "12.5.4.2" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] dependencies = [ - { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/eb/eb/6681efd0aa7df96b4f8067b3ce7246833dd36830bb4cec8896182773db7d/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d25b62fb18751758fe3c93a4a08eff08effedfe4edf1c6bb5afd0890fe88f887", size = 216451147, upload-time = "2024-11-20T17:44:18.055Z" }, @@ -1524,26 +1367,10 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/45/ef/876ad8e4260e1128e6d4aac803d9d51baf3791ebdb4a9b8d9b8db032b4b0/nvidia_cusparse_cu12-12.5.4.2-py3-none-win_amd64.whl", hash = "sha256:4acb8c08855a26d737398cba8fb6f8f5045d93f82612b4cfd84645a2332ccf20", size = 213712630, upload-time = "2024-10-01T17:14:23.779Z" }, ] -[[package]] -name = "nvidia-cusparselt-cu12" -version = "0.6.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/98/8e/675498726c605c9441cf46653bd29cb1b8666da1fb1469ffa25f67f20c58/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8", size = 149422781, upload-time = "2024-07-23T17:35:27.203Z" }, - { url = "https://files.pythonhosted.org/packages/78/a8/bcbb63b53a4b1234feeafb65544ee55495e1bb37ec31b999b963cbccfd1d/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9", size = 150057751, upload-time = "2024-07-23T02:35:53.074Z" }, - { url = "https://files.pythonhosted.org/packages/56/8f/2c33082238b6c5e783a877dc8786ab62619e3e6171c083bd3bba6e3fe75e/nvidia_cusparselt_cu12-0.6.2-py3-none-win_amd64.whl", hash = "sha256:0057c91d230703924c0422feabe4ce768841f9b4b44d28586b6f6d2eb86fbe70", size = 148755794, upload-time = "2024-07-23T02:35:00.261Z" }, -] - [[package]] name = "nvidia-cusparselt-cu12" version = "0.6.3" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] wheels = [ { url = "https://files.pythonhosted.org/packages/62/da/4de092c61c6dea1fc9c936e69308a02531d122e12f1f649825934ad651b5/nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8371549623ba601a06322af2133c4a44350575f5a3108fb75f3ef20b822ad5f1", size = 156402859, 
upload-time = "2024-10-16T02:23:17.184Z" }, { url = "https://files.pythonhosted.org/packages/3b/9a/72ef35b399b0e183bc2e8f6f558036922d453c4d8237dab26c666a04244b/nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e5c8a26c36445dd2e6812f1177978a24e2d37cacce7e090f297a688d1ec44f46", size = 156785796, upload-time = "2024-10-15T21:29:17.709Z" }, @@ -1567,52 +1394,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/df/99/12cd266d6233f47d00daf3a72739872bdc10267d0383508b0b9c84a18bb6/nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0", size = 188654414, upload-time = "2024-04-03T15:32:57.427Z" }, ] -[[package]] -name = "nvidia-nvjitlink-cu12" -version = "12.4.127" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510, upload-time = "2024-06-18T20:20:13.871Z" }, - { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810, upload-time = "2024-04-03T20:59:46.957Z" }, - { url = "https://files.pythonhosted.org/packages/81/19/0babc919031bee42620257b9a911c528f05fb2688520dcd9ca59159ffea8/nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1", size = 95336325, upload-time = "2024-04-03T21:06:25.073Z" }, -] - [[package]] name = "nvidia-nvjitlink-cu12" version = "12.6.85" source = { 
registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] wheels = [ { url = "https://files.pythonhosted.org/packages/9d/d7/c5383e47c7e9bf1c99d5bd2a8c935af2b6d705ad831a7ec5c97db4d82f4f/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:eedc36df9e88b682efe4309aa16b5b4e78c2407eac59e8c10a6a47535164369a", size = 19744971, upload-time = "2024-11-20T17:46:53.366Z" }, { url = "https://files.pythonhosted.org/packages/31/db/dc71113d441f208cdfe7ae10d4983884e13f464a6252450693365e166dcf/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cf4eaa7d4b6b543ffd69d6abfb11efdeb2db48270d94dfd3a452c24150829e41", size = 19270338, upload-time = "2024-11-20T17:46:29.758Z" }, { url = "https://files.pythonhosted.org/packages/89/76/93c1467b1387387440a4d25102d86b7794535449b689f8e2dc22c1c8ff7f/nvidia_nvjitlink_cu12-12.6.85-py3-none-win_amd64.whl", hash = "sha256:e61120e52ed675747825cdd16febc6a0730537451d867ee58bee3853b1b13d1c", size = 161908572, upload-time = "2024-11-20T17:52:40.124Z" }, ] -[[package]] -name = "nvidia-nvtx-cu12" -version = "12.4.127" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417, upload-time = "2024-06-18T20:16:22.484Z" }, - { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144, upload-time = 
"2024-04-03T20:56:12.406Z" }, - { url = "https://files.pythonhosted.org/packages/54/1b/f77674fbb73af98843be25803bbd3b9a4f0a96c75b8d33a2854a5c7d2d77/nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485", size = 66307, upload-time = "2024-04-03T21:02:01.959Z" }, -] - [[package]] name = "nvidia-nvtx-cu12" version = "12.6.77" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "platform_machine == 'x86_64' and sys_platform == 'linux'", -] wheels = [ { url = "https://files.pythonhosted.org/packages/b9/93/80f8a520375af9d7ee44571a6544653a176e53c2b8ccce85b97b83c2491b/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f44f8d86bb7d5629988d61c8d3ae61dddb2015dee142740536bc7481b022fe4b", size = 90549, upload-time = "2024-11-20T17:38:17.387Z" }, { url = "https://files.pythonhosted.org/packages/2b/53/36e2fd6c7068997169b49ffc8c12d5af5e5ff209df6e1a2c4d373b3a638f/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:adcaabb9d436c9761fca2b13959a2d237c5f9fd406c8e4b723c695409ff88059", size = 90539, upload-time = "2024-10-01T17:00:27.179Z" }, @@ -2283,6 +2078,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e6/eb/3bf6ea8ab7f1503dca3a10df2e4b9c3f6b3316df07f6c0ded94b281c7101/scipy-1.15.3-cp312-cp312-win_amd64.whl", hash = "sha256:52092bc0472cfd17df49ff17e70624345efece4e1a12b23783a1ac59a1b728ed", size = 40966184, upload-time = "2025-05-08T16:06:52.623Z" }, ] +[[package]] +name = "seaborn" +version = "0.13.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "matplotlib", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' 
and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, + { name = "numpy", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, + { name = "pandas", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/86/59/a451d7420a77ab0b98f7affa3a1d78a313d2f7281a57afb1a34bae8ab412/seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7", size = 1457696, upload-time = "2024-01-25T13:21:52.551Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987", size = 294914, upload-time = "2024-01-25T13:21:49.598Z" }, +] + [[package]] name = "semantic-version" version = "2.10.0" @@ -2426,8 +2235,8 @@ wheels = [ [[package]] name = "torch" -version = "2.6.0" -source = { registry = "https://pypi.org/simple" } +version = "2.6.0+cpu" +source = { registry = "https://download.pytorch.org/whl/cpu" } resolution-markers = [ "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", @@ -2437,29 +2246,14 @@ dependencies = [ { name = "fsspec", marker = 
"(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "jinja2", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "networkx", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "nvidia-cublas-cu12", version = "12.4.5.8", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu12", version = "9.1.0.70", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu12", version = "11.2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-curand-cu12", version = "10.3.5.147", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu12", version = "11.6.1.9", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", version = "12.3.1.170", source = { registry = "https://pypi.org/simple" }, marker = 
"platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparselt-cu12", version = "0.6.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "setuptools", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, - { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "typing-extensions", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/35/0c52d708144c2deb595cd22819a609f78fdd699b95ff6f0ebcd456e3c7c1/torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9", size = 766624563, upload-time = "2025-01-29T16:23:19.084Z" }, - { url = "https://files.pythonhosted.org/packages/01/d6/455ab3fbb2c61c71c8842753b566012e1ed111e7a4c82e0e1c20d0c76b62/torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb", size = 95607867, upload-time = "2025-01-29T16:25:55.649Z" }, - { url = 
"https://files.pythonhosted.org/packages/18/cf/ae99bd066571656185be0d88ee70abc58467b76f2f7c8bfeb48735a71fe6/torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239", size = 204120469, upload-time = "2025-01-29T16:24:01.821Z" }, - { url = "https://files.pythonhosted.org/packages/81/b4/605ae4173aa37fb5aa14605d100ff31f4f5d49f617928c9f486bb3aaec08/torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989", size = 66532538, upload-time = "2025-01-29T16:24:18.976Z" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-linux_x86_64.whl", hash = "sha256:59e78aa0c690f70734e42670036d6b541930b8eabbaa18d94e090abf14cc4d91" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:318290e8924353c61b125cdc8768d15208704e279e7757c113b9620740deca98" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:4027d982eb2781c93825ab9527f17fbbb12dbabf422298e4b954be60016f87d8" }, ] [[package]] @@ -2508,19 +2302,19 @@ dependencies = [ { name = "fsspec", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "jinja2", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "networkx", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cublas-cu12", version = "12.6.4.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu12", version = "12.6.80", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", version = "12.6.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform 
== 'linux'" }, - { name = "nvidia-cuda-runtime-cu12", version = "12.6.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu12", version = "9.5.1.17", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu12", version = "11.3.0.4", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-curand-cu12", version = "10.3.7.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu12", version = "11.7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", version = "12.5.4.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparselt-cu12", version = "0.6.3", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" 
}, + { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu12", version = "12.6.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "setuptools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "sympy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, @@ -2687,6 +2481,7 @@ dependencies = [ { name = "polars", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, { name = "psutil", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 
'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, { name = "pynvml", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, + { name = "seaborn", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, { name = "tqdm", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, { name = "weathergen-common", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, { name = "weathergen-evaluate", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform 
== 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, @@ -2696,7 +2491,7 @@ dependencies = [ [package.optional-dependencies] cpu = [ - { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "torch", version = "2.6.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ] gpu = [ { name = "flash-attn", version = "2.7.3", source = { url = "https://object-store.os-api.cci1.ecmwf.int/weathergenerator-dev/wheels/flash_attn-2.7.3-cp312-cp312-linux_aarch64.whl" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'extra-10-weathergen-gpu') or (platform_machine != 'aarch64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, @@ -2735,11 +2530,12 @@ requires-dist = [ { name = "polars", specifier = "~=1.25.2" }, { name = "psutil" }, { name = "pynvml" }, + { name = "seaborn", specifier = ">=0.13.2" }, { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'gpu'", url = "https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-linux_aarch64.whl" }, { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'gpu'", url = "https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-manylinux_2_28_x86_64.whl" }, + { name = "torch", marker = "sys_platform == 'linux' and extra == 'cpu'", specifier = "==2.6.0", index = 
"https://download.pytorch.org/whl/cpu", conflict = { package = "weathergen", extra = "cpu" } }, + { name = "torch", marker = "sys_platform != 'linux' and extra == 'cpu'", specifier = "==2.6.0" }, { name = "torch", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'gpu') or (sys_platform != 'linux' and extra == 'gpu')", specifier = "==2.6.0+cu126" }, - { name = "torch", marker = "sys_platform == 'macosx' and extra == 'cpu'", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "weathergen", extra = "cpu" } }, - { name = "torch", marker = "sys_platform != 'macosx' and extra == 'cpu'", specifier = "==2.6.0" }, { name = "tqdm" }, { name = "weathergen-common", editable = "packages/common" }, { name = "weathergen-evaluate", editable = "packages/evaluate" }, From 41716a670c0fbddbe96a3433210ff9d3cd717236 Mon Sep 17 00:00:00 2001 From: Julian Kuehnert Date: Thu, 9 Oct 2025 16:02:02 +0000 Subject: [PATCH 09/19] test gradient logging on mutli gpus --- config/default_config.yml | 2 +- src/weathergen/train/trainer.py | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/config/default_config.yml b/config/default_config.yml index b14fddcba..d67d5359e 100644 --- a/config/default_config.yml +++ b/config/default_config.yml @@ -128,7 +128,7 @@ grad_clip: 1.0 weight_decay: 0.1 norm_type: "LayerNorm" nn_module: "te" -log_grad_norms: False +log_grad_norms: True start_date: 197901010000 end_date: 202012310000 diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py index 17f7e4433..83515d317 100644 --- a/src/weathergen/train/trainer.py +++ b/src/weathergen/train/trainer.py @@ -930,12 +930,23 @@ def _log_instant_grad_norms(self, stage: Stage, total_norm): TODO test DDP case """ - grad_norms = {"total_grad_norm": total_norm.item()} - self.last_grad_norm = total_norm.item() - for name, param in self.ddp_model.named_parameters(): + self.last_grad_norm = ( + 
total_norm.full_tensor().item() if self.cf.world_size > 1 else total_norm.item() + ) + grad_norms = {"total_grad_norm": self.last_grad_norm} + for name, param in self.model.named_parameters(): if param.grad is not None: - grad_norms["grad_norm_" + name] = param.grad.norm().item() - self.train_logger.log_metrics(TRAIN, grad_norms) + # grad_norms["grad_norm_" + name] = param.grad.norm().item() + grad_norms["grad_norm_" + name] = ( + param.grad.norm().full_tensor().item() + if self.cf.world_size > 1 + else param.grad.norm().item() + ) + + # print(".item():", param.grad.norm().item()) + # print(".full_tensor().item()", param.grad.norm().full_tensor().item()) + if is_root(): + self.train_logger.log_metrics(TRAIN, grad_norms) def _log_terminal(self, bidx: int, epoch: int, stage: Stage): if bidx % self.print_freq == 0 and bidx > 0 or stage == VAL: From 8bdbac41ec37c4ed18bb9e2fed54af90c20ca60f Mon Sep 17 00:00:00 2001 From: Julian Kuehnert Date: Mon, 13 Oct 2025 13:24:44 +0000 Subject: [PATCH 10/19] update uv.lock to latest develop version --- uv.lock | 292 +++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 248 insertions(+), 44 deletions(-) diff --git a/uv.lock b/uv.lock index 79a5b2e2f..56e875859 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = "==3.12.*" resolution-markers = [ "platform_machine == 'aarch64' and sys_platform == 'linux'", @@ -874,7 +874,7 @@ name = "jinja2" version = "3.1.6" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "markupsafe", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "markupsafe", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and 
sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } wheels = [ @@ -1251,20 +1251,52 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c2/1c/6d343e030815c7c97a1f9fbad00211b47717c7fe446834c224bd5311e6f1/numpy-2.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:bd8df082b6c4695753ad6193018c05aac465d634834dca47a3ae06d4bb22d9ea", size = 9891498, upload-time = "2025-06-07T14:43:36.332Z" }, ] +[[package]] +name = "nvidia-cublas-cu12" +version = "12.4.5.8" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/7f/7f/7fbae15a3982dc9595e49ce0f19332423b260045d0a6afe93cdbe2f1f624/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3", size = 363333771, upload-time = "2024-06-18T19:28:09.881Z" }, + { url = "https://files.pythonhosted.org/packages/ae/71/1c91302526c45ab494c23f61c7a84aa568b8c1f9d196efa5993957faf906/nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b", size = 363438805, upload-time = "2024-04-03T20:57:06.025Z" }, + { url = "https://files.pythonhosted.org/packages/e2/2a/4f27ca96232e8b5269074a72e03b4e0d43aa68c9b965058b1684d07c6ff8/nvidia_cublas_cu12-12.4.5.8-py3-none-win_amd64.whl", hash = "sha256:5a796786da89203a0657eda402bcdcec6180254a8ac22d72213abc42069522dc", size = 396895858, upload-time = "2024-04-03T21:03:31.996Z" }, +] + [[package]] name = "nvidia-cublas-cu12" 
version = "12.6.4.1" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] wheels = [ { url = "https://files.pythonhosted.org/packages/af/eb/ff4b8c503fa1f1796679dce648854d58751982426e4e4b37d6fce49d259c/nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:08ed2686e9875d01b58e3cb379c6896df8e76c75e0d4a7f7dace3d7b6d9ef8eb", size = 393138322, upload-time = "2024-11-20T17:40:25.65Z" }, { url = "https://files.pythonhosted.org/packages/97/0d/f1f0cadbf69d5b9ef2e4f744c9466cb0a850741d08350736dfdb4aa89569/nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:235f728d6e2a409eddf1df58d5b0921cf80cfa9e72b9f2775ccb7b4a87984668", size = 390794615, upload-time = "2024-11-20T17:39:52.715Z" }, { url = "https://files.pythonhosted.org/packages/84/f7/985e9bdbe3e0ac9298fcc8cfa51a392862a46a0ffaccbbd56939b62a9c83/nvidia_cublas_cu12-12.6.4.1-py3-none-win_amd64.whl", hash = "sha256:9e4fa264f4d8a4eb0cdbd34beadc029f453b3bafae02401e999cf3d5a5af75f8", size = 434535301, upload-time = "2024-11-20T17:50:41.681Z" }, ] +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/93/b5/9fb3d00386d3361b03874246190dfec7b206fd74e6e287b26a8fcb359d95/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a", size = 12354556, upload-time = "2024-06-18T19:30:40.546Z" }, + { url = "https://files.pythonhosted.org/packages/67/42/f4f60238e8194a3106d06a058d494b18e006c10bb2b915655bd9f6ea4cb1/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb", size = 13813957, 
upload-time = "2024-04-03T20:55:01.564Z" }, + { url = "https://files.pythonhosted.org/packages/f3/79/8cf313ec17c58ccebc965568e5bcb265cdab0a1df99c4e674bb7a3b99bfe/nvidia_cuda_cupti_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:5688d203301ab051449a2b1cb6690fbe90d2b372f411521c86018b950f3d7922", size = 9938035, upload-time = "2024-04-03T21:01:01.109Z" }, +] + [[package]] name = "nvidia-cuda-cupti-cu12" version = "12.6.80" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] wheels = [ { url = "https://files.pythonhosted.org/packages/e6/8b/2f6230cb715646c3a9425636e513227ce5c93c4d65823a734f4bb86d43c3/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:166ee35a3ff1587f2490364f90eeeb8da06cd867bd5b701bf7f9a02b78bc63fc", size = 8236764, upload-time = "2024-11-20T17:35:41.03Z" }, { url = "https://files.pythonhosted.org/packages/25/0f/acb326ac8fd26e13c799e0b4f3b2751543e1834f04d62e729485872198d4/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_aarch64.whl", hash = "sha256:358b4a1d35370353d52e12f0a7d1769fc01ff74a191689d3870b2123156184c4", size = 8236756, upload-time = "2024-10-01T16:57:45.507Z" }, @@ -1273,20 +1305,52 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1c/81/7796f096afaf726796b1b648f3bc80cafc61fe7f77f44a483c89e6c5ef34/nvidia_cuda_cupti_cu12-12.6.80-py3-none-win_amd64.whl", hash = "sha256:bbe6ae76e83ce5251b56e8c8e61a964f757175682bbad058b170b136266ab00a", size = 5724175, upload-time = "2024-10-01T17:09:47.955Z" }, ] +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/77/aa/083b01c427e963ad0b314040565ea396f914349914c298556484f799e61b/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198", size = 24133372, upload-time = "2024-06-18T19:32:00.576Z" }, + { url = "https://files.pythonhosted.org/packages/2c/14/91ae57cd4db3f9ef7aa99f4019cfa8d54cb4caa7e00975df6467e9725a9f/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338", size = 24640306, upload-time = "2024-04-03T20:56:01.463Z" }, + { url = "https://files.pythonhosted.org/packages/7c/30/8c844bfb770f045bcd8b2c83455c5afb45983e1a8abf0c4e5297b481b6a5/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:a961b2f1d5f17b14867c619ceb99ef6fcec12e46612711bcec78eb05068a60ec", size = 19751955, upload-time = "2024-04-03T21:01:51.133Z" }, +] + [[package]] name = "nvidia-cuda-nvrtc-cu12" version = "12.6.77" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] wheels = [ { url = "https://files.pythonhosted.org/packages/f4/2f/72df534873235983cc0a5371c3661bebef7c4682760c275590b972c7b0f9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5847f1d6e5b757f1d2b3991a01082a44aad6f10ab3c5c0213fa3e25bddc25a13", size = 23162955, upload-time = "2024-10-01T16:59:50.922Z" }, { url = "https://files.pythonhosted.org/packages/75/2e/46030320b5a80661e88039f59060d1790298b4718944a65a7f2aeda3d9e9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:35b0cc6ee3a9636d5409133e79273ce1f3fd087abb0532d2d2e8fff1fe9efc53", size = 23650380, upload-time = "2024-10-01T17:00:14.643Z" }, { url = 
"https://files.pythonhosted.org/packages/f5/46/d3a1cdda8bb113c80f43a0a6f3a853356d487b830f3483f92d49ce87fa55/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:f7007dbd914c56bd80ea31bc43e8e149da38f68158f423ba845fc3292684e45a", size = 39026742, upload-time = "2024-10-01T17:10:49.058Z" }, ] +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/a1/aa/b656d755f474e2084971e9a297def515938d56b466ab39624012070cb773/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3", size = 894177, upload-time = "2024-06-18T19:32:52.877Z" }, + { url = "https://files.pythonhosted.org/packages/ea/27/1795d86fe88ef397885f2e580ac37628ed058a92ed2c39dc8eac3adf0619/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5", size = 883737, upload-time = "2024-04-03T20:54:51.355Z" }, + { url = "https://files.pythonhosted.org/packages/a8/8b/450e93fab75d85a69b50ea2d5fdd4ff44541e0138db16f9cd90123ef4de4/nvidia_cuda_runtime_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:09c2e35f48359752dfa822c09918211844a3d93c100a715d79b59591130c5e1e", size = 878808, upload-time = "2024-04-03T21:00:49.77Z" }, +] + [[package]] name = "nvidia-cuda-runtime-cu12" version = "12.6.77" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] wheels = [ { url = "https://files.pythonhosted.org/packages/8f/ea/590b2ac00d772a8abd1c387a92b46486d2679ca6622fd25c18ff76265663/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:6116fad3e049e04791c0256a9778c16237837c08b27ed8c8401e2e45de8d60cd", size = 908052, upload-time = "2024-11-20T17:35:19.905Z" }, { url = "https://files.pythonhosted.org/packages/b7/3d/159023799677126e20c8fd580cca09eeb28d5c5a624adc7f793b9aa8bbfa/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d461264ecb429c84c8879a7153499ddc7b19b5f8d84c204307491989a365588e", size = 908040, upload-time = "2024-10-01T16:57:22.221Z" }, @@ -1295,12 +1359,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/76/4c80fa138333cc975743fd0687a745fccb30d167f906f13c1c7f9a85e5ea/nvidia_cuda_runtime_cu12-12.6.77-py3-none-win_amd64.whl", hash = "sha256:86c58044c824bf3c173c49a2dbc7a6c8b53cb4e4dca50068be0bf64e9dab3f7f", size = 891773, upload-time = "2024-10-01T17:09:26.362Z" }, ] +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.1.0.70" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "nvidia-cublas-cu12", version = "12.4.5.8", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741, upload-time = "2024-04-22T15:24:15.253Z" }, + { url = "https://files.pythonhosted.org/packages/3f/d0/f90ee6956a628f9f04bf467932c0a25e5a7e706a684b896593c06c82f460/nvidia_cudnn_cu12-9.1.0.70-py3-none-win_amd64.whl", hash = "sha256:6278562929433d68365a07a4a1546c237ba2849852c0d4b2262a486e805b977a", size = 679925892, upload-time = "2024-04-22T15:24:53.333Z" }, +] + [[package]] name = "nvidia-cudnn-cu12" version = "9.5.1.17" source = { registry = "https://pypi.org/simple" } +resolution-markers 
= [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] dependencies = [ - { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", version = "12.6.4.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/99/93/a201a12d3ec1caa8c6ac34c1c2f9eeb696b886f0c36ff23c638b46603bd0/nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9fd4584468533c61873e5fda8ca41bac3a38bcb2d12350830c69b0a96a7e4def", size = 570523509, upload-time = "2024-10-25T19:53:03.148Z" }, @@ -1308,12 +1390,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b6/b2/3f60d15f037fa5419d9d7f788b100ef33ea913ae5315c87ca6d6fa606c35/nvidia_cudnn_cu12-9.5.1.17-py3-none-win_amd64.whl", hash = "sha256:d7af0f8a4f3b4b9dbb3122f2ef553b45694ed9c384d5a75bab197b8eefb79ab8", size = 565440743, upload-time = "2024-10-25T19:55:49.74Z" }, ] +[[package]] +name = "nvidia-cufft-cu12" +version = "11.2.1.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/8a/0e728f749baca3fbeffad762738276e5df60851958be7783af121a7221e7/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399", size = 211422548, upload-time = "2024-06-18T19:33:39.396Z" }, + { url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = 
"sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117, upload-time = "2024-04-03T20:57:40.402Z" }, + { url = "https://files.pythonhosted.org/packages/f6/ee/3f3f8e9874f0be5bbba8fb4b62b3de050156d159f8b6edc42d6f1074113b/nvidia_cufft_cu12-11.2.1.3-py3-none-win_amd64.whl", hash = "sha256:d802f4954291101186078ccbe22fc285a902136f974d369540fd4a5333d1440b", size = 210576476, upload-time = "2024-04-03T21:04:06.422Z" }, +] + [[package]] name = "nvidia-cufft-cu12" version = "11.3.0.4" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/1f/37/c50d2b2f2c07e146776389e3080f4faf70bcc4fa6e19d65bb54ca174ebc3/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d16079550df460376455cba121db6564089176d9bac9e4f360493ca4741b22a6", size = 200164144, upload-time = "2024-11-20T17:40:58.288Z" }, @@ -1323,10 +1424,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b4/38/36fd800cec8f6e89b7c1576edaaf8076e69ec631644cdbc1b5f2e2b5a9df/nvidia_cufft_cu12-11.3.0.4-py3-none-win_amd64.whl", hash = "sha256:6048ebddfb90d09d2707efb1fd78d4e3a77cb3ae4dc60e19aab6be0ece2ae464", size = 199356881, upload-time = "2024-10-01T17:13:01.861Z" }, ] +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.5.147" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/80/9c/a79180e4d70995fdf030c6946991d0171555c6edf95c265c6b2bf7011112/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9", size = 56314811, upload-time = "2024-06-18T19:34:48.575Z" }, + { url = "https://files.pythonhosted.org/packages/8a/6d/44ad094874c6f1b9c654f8ed939590bdc408349f137f9b98a3a23ccec411/nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b", size = 56305206, upload-time = "2024-04-03T20:58:08.722Z" }, + { url = "https://files.pythonhosted.org/packages/1c/22/2573503d0d4e45673c263a313f79410e110eb562636b0617856fdb2ff5f6/nvidia_curand_cu12-10.3.5.147-py3-none-win_amd64.whl", hash = "sha256:f307cc191f96efe9e8f05a87096abc20d08845a841889ef78cb06924437f6771", size = 55799918, upload-time = "2024-04-03T21:04:34.45Z" }, +] + [[package]] name = "nvidia-curand-cu12" version = "10.3.7.77" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] wheels = [ { url = "https://files.pythonhosted.org/packages/42/ac/36543605358a355632f1a6faa3e2d5dfb91eab1e4bc7d552040e0383c335/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:6e82df077060ea28e37f48a3ec442a8f47690c7499bff392a5938614b56c98d8", size = 56289881, upload-time = "2024-10-01T17:04:18.981Z" }, { url = "https://files.pythonhosted.org/packages/73/1b/44a01c4e70933637c93e6e1a8063d1e998b50213a6b65ac5a9169c47e98e/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a42cd1344297f70b9e39a1e4f467a4e1c10f1da54ff7a85c12197f6c652c8bdf", size = 56279010, upload-time = "2024-11-20T17:42:50.958Z" }, @@ -1335,14 +1452,35 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/a9/a8/0cd0cec757bd4b4b4ef150fca62ec064db7d08a291dced835a0be7d2c147/nvidia_curand_cu12-10.3.7.77-py3-none-win_amd64.whl", hash = "sha256:6d6d935ffba0f3d439b7cd968192ff068fafd9018dbf1b85b37261b13cfc9905", size = 55783873, upload-time = "2024-10-01T17:13:30.377Z" }, ] +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.6.1.9" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "nvidia-cublas-cu12", version = "12.4.5.8", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", version = "12.3.1.170", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/6b/a5c33cf16af09166845345275c34ad2190944bcc6026797a39f8e0a282e0/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e", size = 127634111, upload-time = "2024-06-18T19:35:01.793Z" }, + { url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057, upload-time = "2024-04-03T20:58:28.735Z" }, + { url = "https://files.pythonhosted.org/packages/f2/be/d435b7b020e854d5d5a682eb5de4328fd62f6182507406f2818280e206e2/nvidia_cusolver_cu12-11.6.1.9-py3-none-win_amd64.whl", hash = "sha256:e77314c9d7b694fcebc84f58989f3aa4fb4cb442f12ca1a9bde50f5e8f6d1b9c", 
size = 125224015, upload-time = "2024-04-03T21:04:53.339Z" }, +] + [[package]] name = "nvidia-cusolver-cu12" version = "11.7.1.2" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] dependencies = [ - { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", version = "12.6.4.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", version = "12.5.4.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/93/17/dbe1aa865e4fdc7b6d4d0dd308fdd5aaab60f939abfc0ea1954eac4fb113/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0ce237ef60acde1efc457335a2ddadfd7610b892d94efee7b776c64bb1cac9e0", size = 157833628, upload-time = "2024-10-01T17:05:05.591Z" }, @@ -1352,12 +1490,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d4/53/fff50a0808df7113d77e3bbc7c2b7eaed6f57d5eb80fbe93ead2aea1e09a/nvidia_cusolver_cu12-11.7.1.2-py3-none-win_amd64.whl", hash = "sha256:6813f9d8073f555444a8705f3ab0296d3e1cb37a16d694c5fc8b862a0d8706d7", size = 149287877, upload-time = "2024-10-01T17:13:49.804Z" }, ] +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.3.1.170" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 
'x86_64' and sys_platform == 'linux'", +] +dependencies = [ + { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/96/a9/c0d2f83a53d40a4a41be14cea6a0bf9e668ffcf8b004bd65633f433050c0/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3", size = 207381987, upload-time = "2024-06-18T19:35:32.989Z" }, + { url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763, upload-time = "2024-04-03T20:58:59.995Z" }, + { url = "https://files.pythonhosted.org/packages/a2/e0/3155ca539760a8118ec94cc279b34293309bcd14011fc724f87f31988843/nvidia_cusparse_cu12-12.3.1.170-py3-none-win_amd64.whl", hash = "sha256:9bc90fb087bc7b4c15641521f31c0371e9a612fc2ba12c338d3ae032e6b6797f", size = 204684315, upload-time = "2024-04-03T21:05:26.031Z" }, +] + [[package]] name = "nvidia-cusparse-cu12" version = "12.5.4.2" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/eb/eb/6681efd0aa7df96b4f8067b3ce7246833dd36830bb4cec8896182773db7d/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:d25b62fb18751758fe3c93a4a08eff08effedfe4edf1c6bb5afd0890fe88f887", size = 216451147, upload-time = "2024-11-20T17:44:18.055Z" }, @@ -1367,10 +1524,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/45/ef/876ad8e4260e1128e6d4aac803d9d51baf3791ebdb4a9b8d9b8db032b4b0/nvidia_cusparse_cu12-12.5.4.2-py3-none-win_amd64.whl", hash = "sha256:4acb8c08855a26d737398cba8fb6f8f5045d93f82612b4cfd84645a2332ccf20", size = 213712630, upload-time = "2024-10-01T17:14:23.779Z" }, ] +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.6.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/8e/675498726c605c9441cf46653bd29cb1b8666da1fb1469ffa25f67f20c58/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_aarch64.whl", hash = "sha256:067a7f6d03ea0d4841c85f0c6f1991c5dda98211f6302cb83a4ab234ee95bef8", size = 149422781, upload-time = "2024-07-23T17:35:27.203Z" }, + { url = "https://files.pythonhosted.org/packages/78/a8/bcbb63b53a4b1234feeafb65544ee55495e1bb37ec31b999b963cbccfd1d/nvidia_cusparselt_cu12-0.6.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:df2c24502fd76ebafe7457dbc4716b2fec071aabaed4fb7691a201cde03704d9", size = 150057751, upload-time = "2024-07-23T02:35:53.074Z" }, + { url = "https://files.pythonhosted.org/packages/56/8f/2c33082238b6c5e783a877dc8786ab62619e3e6171c083bd3bba6e3fe75e/nvidia_cusparselt_cu12-0.6.2-py3-none-win_amd64.whl", hash = "sha256:0057c91d230703924c0422feabe4ce768841f9b4b44d28586b6f6d2eb86fbe70", size = 148755794, upload-time = "2024-07-23T02:35:00.261Z" }, +] + [[package]] name = "nvidia-cusparselt-cu12" version = "0.6.3" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] wheels = [ { url = 
"https://files.pythonhosted.org/packages/62/da/4de092c61c6dea1fc9c936e69308a02531d122e12f1f649825934ad651b5/nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8371549623ba601a06322af2133c4a44350575f5a3108fb75f3ef20b822ad5f1", size = 156402859, upload-time = "2024-10-16T02:23:17.184Z" }, { url = "https://files.pythonhosted.org/packages/3b/9a/72ef35b399b0e183bc2e8f6f558036922d453c4d8237dab26c666a04244b/nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e5c8a26c36445dd2e6812f1177978a24e2d37cacce7e090f297a688d1ec44f46", size = 156785796, upload-time = "2024-10-15T21:29:17.709Z" }, @@ -1394,20 +1567,52 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/df/99/12cd266d6233f47d00daf3a72739872bdc10267d0383508b0b9c84a18bb6/nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0", size = 188654414, upload-time = "2024-04-03T15:32:57.427Z" }, ] +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/45/239d52c05074898a80a900f49b1615d81c07fceadd5ad6c4f86a987c0bc4/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83", size = 20552510, upload-time = "2024-06-18T20:20:13.871Z" }, + { url = "https://files.pythonhosted.org/packages/ff/ff/847841bacfbefc97a00036e0fce5a0f086b640756dc38caea5e1bb002655/nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57", size = 21066810, upload-time = "2024-04-03T20:59:46.957Z" }, + { url = 
"https://files.pythonhosted.org/packages/81/19/0babc919031bee42620257b9a911c528f05fb2688520dcd9ca59159ffea8/nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1", size = 95336325, upload-time = "2024-04-03T21:06:25.073Z" }, +] + [[package]] name = "nvidia-nvjitlink-cu12" version = "12.6.85" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] wheels = [ { url = "https://files.pythonhosted.org/packages/9d/d7/c5383e47c7e9bf1c99d5bd2a8c935af2b6d705ad831a7ec5c97db4d82f4f/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:eedc36df9e88b682efe4309aa16b5b4e78c2407eac59e8c10a6a47535164369a", size = 19744971, upload-time = "2024-11-20T17:46:53.366Z" }, { url = "https://files.pythonhosted.org/packages/31/db/dc71113d441f208cdfe7ae10d4983884e13f464a6252450693365e166dcf/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cf4eaa7d4b6b543ffd69d6abfb11efdeb2db48270d94dfd3a452c24150829e41", size = 19270338, upload-time = "2024-11-20T17:46:29.758Z" }, { url = "https://files.pythonhosted.org/packages/89/76/93c1467b1387387440a4d25102d86b7794535449b689f8e2dc22c1c8ff7f/nvidia_nvjitlink_cu12-12.6.85-py3-none-win_amd64.whl", hash = "sha256:e61120e52ed675747825cdd16febc6a0730537451d867ee58bee3853b1b13d1c", size = 161908572, upload-time = "2024-11-20T17:52:40.124Z" }, ] +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.4.127" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/39/471f581edbb7804b39e8063d92fc8305bdc7a80ae5c07dbe6ea5c50d14a5/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = 
"sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3", size = 100417, upload-time = "2024-06-18T20:16:22.484Z" }, + { url = "https://files.pythonhosted.org/packages/87/20/199b8713428322a2f22b722c62b8cc278cc53dffa9705d744484b5035ee9/nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a", size = 99144, upload-time = "2024-04-03T20:56:12.406Z" }, + { url = "https://files.pythonhosted.org/packages/54/1b/f77674fbb73af98843be25803bbd3b9a4f0a96c75b8d33a2854a5c7d2d77/nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485", size = 66307, upload-time = "2024-04-03T21:02:01.959Z" }, +] + [[package]] name = "nvidia-nvtx-cu12" version = "12.6.77" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", +] wheels = [ { url = "https://files.pythonhosted.org/packages/b9/93/80f8a520375af9d7ee44571a6544653a176e53c2b8ccce85b97b83c2491b/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f44f8d86bb7d5629988d61c8d3ae61dddb2015dee142740536bc7481b022fe4b", size = 90549, upload-time = "2024-11-20T17:38:17.387Z" }, { url = "https://files.pythonhosted.org/packages/2b/53/36e2fd6c7068997169b49ffc8c12d5af5e5ff209df6e1a2c4d373b3a638f/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_aarch64.whl", hash = "sha256:adcaabb9d436c9761fca2b13959a2d237c5f9fd406c8e4b723c695409ff88059", size = 90539, upload-time = "2024-10-01T17:00:27.179Z" }, @@ -2078,20 +2283,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e6/eb/3bf6ea8ab7f1503dca3a10df2e4b9c3f6b3316df07f6c0ded94b281c7101/scipy-1.15.3-cp312-cp312-win_amd64.whl", hash = "sha256:52092bc0472cfd17df49ff17e70624345efece4e1a12b23783a1ac59a1b728ed", size = 40966184, upload-time = "2025-05-08T16:06:52.623Z" }, ] -[[package]] -name = 
"seaborn" -version = "0.13.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "matplotlib", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, - { name = "numpy", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, - { name = "pandas", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/86/59/a451d7420a77ab0b98f7affa3a1d78a313d2f7281a57afb1a34bae8ab412/seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7", size = 1457696, upload-time = "2024-01-25T13:21:52.551Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987", size = 294914, upload-time = "2024-01-25T13:21:49.598Z" }, -] - [[package]] name = "semantic-version" version = "2.10.0" @@ 
-2235,8 +2426,8 @@ wheels = [ [[package]] name = "torch" -version = "2.6.0+cpu" -source = { registry = "https://download.pytorch.org/whl/cpu" } +version = "2.6.0" +source = { registry = "https://pypi.org/simple" } resolution-markers = [ "platform_machine == 'aarch64' and sys_platform == 'linux'", "platform_machine == 'x86_64' and sys_platform == 'linux'", @@ -2246,14 +2437,29 @@ dependencies = [ { name = "fsspec", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "jinja2", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "networkx", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cublas-cu12", version = "12.4.5.8", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", version = "9.1.0.70", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", version = "11.2.1.3", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", version = "10.3.5.147", 
source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", version = "11.6.1.9", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", version = "12.3.1.170", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", version = "0.6.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", version = "12.4.127", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "setuptools", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "typing-extensions", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-linux_x86_64.whl", hash = "sha256:59e78aa0c690f70734e42670036d6b541930b8eabbaa18d94e090abf14cc4d91" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl", hash = 
"sha256:318290e8924353c61b125cdc8768d15208704e279e7757c113b9620740deca98" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:4027d982eb2781c93825ab9527f17fbbb12dbabf422298e4b954be60016f87d8" }, + { url = "https://files.pythonhosted.org/packages/e5/35/0c52d708144c2deb595cd22819a609f78fdd699b95ff6f0ebcd456e3c7c1/torch-2.6.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2bb8987f3bb1ef2675897034402373ddfc8f5ef0e156e2d8cfc47cacafdda4a9", size = 766624563, upload-time = "2025-01-29T16:23:19.084Z" }, + { url = "https://files.pythonhosted.org/packages/01/d6/455ab3fbb2c61c71c8842753b566012e1ed111e7a4c82e0e1c20d0c76b62/torch-2.6.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:b789069020c5588c70d5c2158ac0aa23fd24a028f34a8b4fcb8fcb4d7efcf5fb", size = 95607867, upload-time = "2025-01-29T16:25:55.649Z" }, + { url = "https://files.pythonhosted.org/packages/18/cf/ae99bd066571656185be0d88ee70abc58467b76f2f7c8bfeb48735a71fe6/torch-2.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7e1448426d0ba3620408218b50aa6ada88aeae34f7a239ba5431f6c8774b1239", size = 204120469, upload-time = "2025-01-29T16:24:01.821Z" }, + { url = "https://files.pythonhosted.org/packages/81/b4/605ae4173aa37fb5aa14605d100ff31f4f5d49f617928c9f486bb3aaec08/torch-2.6.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:9a610afe216a85a8b9bc9f8365ed561535c93e804c2a317ef7fabcc5deda0989", size = 66532538, upload-time = "2025-01-29T16:24:18.976Z" }, ] [[package]] @@ -2302,19 +2508,19 @@ dependencies = [ { name = "fsspec", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "jinja2", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "networkx", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and 
sys_platform == 'linux'" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cublas-cu12", version = "12.6.4.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-cupti-cu12", version = "12.6.80", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-nvrtc-cu12", version = "12.6.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cuda-runtime-cu12", version = "12.6.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cudnn-cu12", version = "9.5.1.17", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cufft-cu12", version = "11.3.0.4", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-curand-cu12", version = 
"10.3.7.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusolver-cu12", version = "11.7.1.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparse-cu12", version = "12.5.4.2", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-cusparselt-cu12", version = "0.6.3", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, - { name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvjitlink-cu12", version = "12.6.85", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, + { name = "nvidia-nvtx-cu12", version = "12.6.77", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "setuptools", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "sympy", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, @@ -2481,7 +2687,6 @@ dependencies = [ { name = "polars", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 
'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, { name = "psutil", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, { name = "pynvml", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, - { name = "seaborn", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, { name = "tqdm", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, { name = "weathergen-common", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform 
== 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, { name = "weathergen-evaluate", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, @@ -2491,7 +2696,7 @@ dependencies = [ [package.optional-dependencies] cpu = [ - { name = "torch", version = "2.6.0+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "torch", version = "2.6.0", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ] gpu = [ { name = "flash-attn", version = "2.7.3", source = { url = "https://object-store.os-api.cci1.ecmwf.int/weathergenerator-dev/wheels/flash_attn-2.7.3-cp312-cp312-linux_aarch64.whl" }, marker = "(platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'extra-10-weathergen-gpu') or (platform_machine != 'aarch64' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu') or (sys_platform != 'linux' and extra == 'extra-10-weathergen-cpu' and extra == 'extra-10-weathergen-gpu')" }, @@ -2530,12 +2735,11 @@ requires-dist = [ { name = "polars", specifier = "~=1.25.2" }, { name = "psutil" }, { name = "pynvml" }, - { name = "seaborn", specifier = ">=0.13.2" }, { name = "torch", marker = "platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'gpu'", url = "https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-linux_aarch64.whl" 
}, { name = "torch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'gpu'", url = "https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp312-cp312-manylinux_2_28_x86_64.whl" }, - { name = "torch", marker = "sys_platform == 'linux' and extra == 'cpu'", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "weathergen", extra = "cpu" } }, - { name = "torch", marker = "sys_platform != 'linux' and extra == 'cpu'", specifier = "==2.6.0" }, { name = "torch", marker = "(platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'gpu') or (sys_platform != 'linux' and extra == 'gpu')", specifier = "==2.6.0+cu126" }, + { name = "torch", marker = "sys_platform == 'macosx' and extra == 'cpu'", specifier = "==2.6.0", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "weathergen", extra = "cpu" } }, + { name = "torch", marker = "sys_platform != 'macosx' and extra == 'cpu'", specifier = "==2.6.0" }, { name = "tqdm" }, { name = "weathergen-common", editable = "packages/common" }, { name = "weathergen-evaluate", editable = "packages/evaluate" }, From da92f8fd67fabaee8de479893b61d4a43044b5f3 Mon Sep 17 00:00:00 2001 From: Julian Kuehnert Date: Mon, 13 Oct 2025 13:26:31 +0000 Subject: [PATCH 11/19] revert to default confit --- config/default_config.yml | 30 ++++++++++++++---------------- config/streams/era5_1deg/era5.yml | 2 +- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/config/default_config.yml b/config/default_config.yml index d67d5359e..5ab614cf7 100644 --- a/config/default_config.yml +++ b/config/default_config.yml @@ -10,7 +10,7 @@ embed_dropout_rate: 0.1 target_cell_local_prediction: True ae_local_dim_embed: 1024 -ae_local_num_blocks: 0 +ae_local_num_blocks: 2 ae_local_num_heads: 16 ae_local_dropout_rate: 0.1 ae_local_with_qk_lnorm: True @@ -24,7 +24,7 @@ ae_adapter_with_residual: True ae_adapter_dropout_rate: 0.1 ae_global_dim_embed: 2048 
-ae_global_num_blocks: 4 +ae_global_num_blocks: 8 ae_global_num_heads: 32 ae_global_dropout_rate: 0.1 ae_global_with_qk_lnorm: True @@ -36,19 +36,18 @@ ae_global_mlp_hidden_factor: 2 decoder_type: PerceiverIOCoordConditioning # CrossAttentionAdaNormConditioning pred_adapter_kv: False -pred_self_attention: False +pred_self_attention: True pred_dyadic_dims: False pred_mlp_adaln: True # number of steps offset applied to first target window; if set to zero and forecast_steps=0 then # one is training an auto-encoder -forecast_offset : 1 +forecast_offset : 0 forecast_delta_hrs: 0 -forecast_steps: 2 -forecast_policy: "fixed" -forecast_freeze_model: False +forecast_steps: 0 +forecast_policy: null forecast_att_dense_rate: 1.0 -fe_num_blocks: 8 +fe_num_blocks: 0 fe_num_heads: 16 fe_dropout_rate: 0.1 fe_with_qk_lnorm: True @@ -88,7 +87,7 @@ freeze_modules: "" # training mode: "forecast" or "masking" (masked token modeling) # for "masking" to train with auto-encoder mode, forecast_offset should be 0 -training_mode: "forecast" +training_mode: "masking" # masking rate when training mode is "masking"; ignored in foreacast mode masking_rate: 0.6 # sample the masking rate (with normal distribution centered at masking_rate) @@ -96,7 +95,7 @@ masking_rate: 0.6 masking_rate_sampling: True # sample a subset of all target points, useful e.g. 
to reduce memory requirements (also can specify per-stream) sampling_rate_target: 1.0 -# include a masking strategy here, currently only supporting "random", "block", "healpix", "channel", "combination" +# include a masking strategy here, currently only supporting "random", "block", "healpix", "channel", "causal" and "combination" masking_strategy: "random" # masking_strategy_config is a dictionary of additional parameters for the masking strategy # required for "healpix" and "channel" masking strategies @@ -108,17 +107,17 @@ masking_strategy_config: {"strategies": ["random", "healpix", "channel"], "same_strategy_per_batch": false } -num_epochs: 64 +num_epochs: 32 samples_per_epoch: 4096 samples_per_validation: 512 shuffle: True lr_scaling_policy: "sqrt" lr_start: 1e-6 -lr_max: 0.0001 -lr_final_decay: 2e-6 +lr_max: 5e-5 +lr_final_decay: 1e-6 lr_final: 0.0 -lr_steps_warmup: 256 +lr_steps_warmup: 512 lr_steps_cooldown: 512 lr_policy_warmup: "cosine" lr_policy_decay: "linear" @@ -128,7 +127,6 @@ grad_clip: 1.0 weight_decay: 0.1 norm_type: "LayerNorm" nn_module: "te" -log_grad_norms: True start_date: 197901010000 end_date: 202012310000 @@ -154,4 +152,4 @@ run_id: ??? 
# Parameters for logging/printing in the training loop train_log: # The period to log metrics (in number of batch steps) - log_interval: 20 \ No newline at end of file + log_interval: 20 diff --git a/config/streams/era5_1deg/era5.yml b/config/streams/era5_1deg/era5.yml index 85ac8a8ca..bb2234c4e 100644 --- a/config/streams/era5_1deg/era5.yml +++ b/config/streams/era5_1deg/era5.yml @@ -29,7 +29,7 @@ ERA5 : dim_embed : 256 target_readout : type : 'obs_value' # token or obs_value - num_layers : 1 + num_layers : 2 num_heads : 4 # sampling_rate : 0.2 pred_head : From a072c35973ccc445f852445355caf54d92c7dc47 Mon Sep 17 00:00:00 2001 From: Julian Kuehnert Date: Mon, 13 Oct 2025 13:54:43 +0000 Subject: [PATCH 12/19] add comment on FSDP2 specifics --- src/weathergen/train/trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py index 83515d317..cb29bf68e 100644 --- a/src/weathergen/train/trainer.py +++ b/src/weathergen/train/trainer.py @@ -926,9 +926,10 @@ def _log(self, stage: Stage): def _log_instant_grad_norms(self, stage: Stage, total_norm): """ Log instantaneous grad norms, we do not average because of the cost and because we want to - measure the actual values + measure the actual values. 
- TODO test DDP case + Note: When using FSDP2, we need full_tensor().item() instead of .item(), see here: + https://gist.github.com/Kai-46/a9835ef3f36e76d06afee6c11f388144 """ self.last_grad_norm = ( total_norm.full_tensor().item() if self.cf.world_size > 1 else total_norm.item() From c8fadf6a13b41c5fd7b65842af445fc6d2fc38be Mon Sep 17 00:00:00 2001 From: Jubeku Date: Thu, 16 Oct 2025 14:59:48 +0200 Subject: [PATCH 13/19] move plot grad script to private repo --- src/weathergen/utils/plot_grad_norms.py | 525 ------------------------ 1 file changed, 525 deletions(-) delete mode 100644 src/weathergen/utils/plot_grad_norms.py diff --git a/src/weathergen/utils/plot_grad_norms.py b/src/weathergen/utils/plot_grad_norms.py deleted file mode 100644 index ec310c0fc..000000000 --- a/src/weathergen/utils/plot_grad_norms.py +++ /dev/null @@ -1,525 +0,0 @@ -import json -import re -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd - -# ruff: noqa: T201 - - -class GradientNormsAnalyzer: - def __init__(self, json_file_path): - """ - Initialize the analyzer with path to JSON file containing gradient norms. - Expected format: one JSON object per line with step info and gradient norms. 
- """ - self.json_file_path = Path(json_file_path) - self.data = [] - self.df = None - self.load_data() - - def load_data(self): - """Load and parse the JSON data from file.""" - print(f"Loading data from {self.json_file_path}...") - - with open(self.json_file_path) as f: - for line_num, line in enumerate(f, 1): - try: - data_point = json.loads(line.strip()) - self.data.append(data_point) - except json.JSONDecodeError as e: - print(f"Warning: Could not parse line {line_num}: {e}") - - print(f"Loaded {len(self.data)} data points") - self.create_dataframe() - - def create_dataframe(self): - """Convert loaded data into a pandas DataFrame for easier analysis.""" - rows = [] - - for ith, entry in enumerate(self.data): - # step = entry.get('num_samples', entry.get('epoch', 0)) - step = ith * 5 - - # Handle different possible data structures - if "gradients" in entry: - grad_data = entry["gradients"] - elif "grad_norms" in entry: - grad_data = entry["grad_norms"] - else: - # Assume all keys except step/epoch are gradient data - grad_data = { - k: v for k, v in entry.items() if "stream" not in k and ("grad_norm" in k) - } - - for param_name, norm_value in grad_data.items(): - rows.append( - { - "num_samples": step, - "parameter": param_name, - "grad_norm": float(norm_value), - "layer_type": self.extract_layer_type(param_name), - "layer_depth": self.extract_layer_depth(param_name), - } - ) - - self.df = pd.DataFrame(rows) - print(f"Created DataFrame with {len(self.df)} gradient norm records") - - def extract_layer_type(self, param_name): - """Extract layer type from parameter name.""" - param_name_lower = param_name.lower()[10:] - - # Handle your specific naming patterns - if param_name_lower.startswith("embeds."): - if ".embed." in param_name_lower: - return "embedding" - elif ".unembed." in param_name_lower: - return "unembedding" - elif ".ln_final." 
in param_name_lower: - return "layer_norm_final" - elif "proj_heads_q" in param_name_lower: - return "attention_q" - elif "proj_heads_k" in param_name_lower: - return "attention_k" - elif "proj_heads_v" in param_name_lower: - return "attention_v" - elif "proj_out" in param_name_lower: - return "attention_out" - elif ".layers." in param_name_lower and ( - "weight" in param_name_lower or "bias" in param_name_lower - ): - return "ffn" - else: - return "embeds_other" - - elif param_name_lower.startswith("ae_local_blocks."): - if "proj_heads_q" in param_name_lower: - return "ae_local_attention_q" - elif "proj_heads_k" in param_name_lower: - return "ae_local_attention_k" - elif "proj_heads_v" in param_name_lower: - return "ae_local_attention_v" - elif "proj_out" in param_name_lower: - return "ae_local_attention_out" - elif ".layers." in param_name_lower: - return "ae_local_ffn" - else: - return "ae_local_other" - - elif param_name_lower.startswith("ae_global_blocks."): - if "proj_heads_q" in param_name_lower: - return "ae_global_attention_q" - elif "proj_heads_k" in param_name_lower: - return "ae_global_attention_k" - elif "proj_heads_v" in param_name_lower: - return "ae_global_attention_v" - elif "proj_out" in param_name_lower: - return "ae_global_attention_out" - elif ".layers." in param_name_lower: - return "ae_global_ffn" - else: - return "ae_global_other" - - elif param_name_lower.startswith("ae_adapter."): - if "proj_heads_q" in param_name_lower: - return "ae_adapter_attention_q" - elif "proj_heads_k" in param_name_lower: - return "ae_adapter_attention_k" - elif "proj_heads_v" in param_name_lower: - return "ae_adapter_attention_v" - elif "proj_out" in param_name_lower: - return "ae_adapter_attention_out" - elif ".layers." 
in param_name_lower: - return "ae_adapter_ffn" - else: - return "ae_adapter_other" - - elif param_name_lower.startswith("target_token_engines."): - if "proj_heads_q" in param_name_lower: - return "tte_attention_q" - elif "proj_heads_k" in param_name_lower: - return "tte_attention_k" - elif "proj_heads_v" in param_name_lower: - return "tte_attention_v" - elif "proj_out" in param_name_lower: - return "tte_attention_out" - elif "embed_aux" in param_name_lower: - return "tte_embed_aux" - elif "lnorm" in param_name_lower: - return "tte_layer_norm" - elif ".layers." in param_name_lower: - return "tte_ffn" - else: - return "tte_other" - - elif param_name_lower.startswith("embed_target_coords."): - return "target_coords_embedding" - - elif param_name_lower.startswith("pred_heads."): - return "prediction_head" - - # Fallback for standard patterns (if any) - elif "embed" in param_name_lower: - return "embedding" - elif "attention" in param_name_lower or "attn" in param_name_lower: - if "q_proj" in param_name_lower or "query" in param_name_lower: - return "attention_q" - elif "k_proj" in param_name_lower or "key" in param_name_lower: - return "attention_k" - elif "v_proj" in param_name_lower or "value" in param_name_lower: - return "attention_v" - elif "o_proj" in param_name_lower or "out" in param_name_lower: - return "attention_out" - else: - return "attention" - elif ( - "layernorm" in param_name_lower - or "layer_norm" in param_name_lower - or "ln" in param_name_lower - ): - return "layernorm" - else: - return "other" - - def extract_layer_depth(self, param_name): - """Extract layer depth/index from parameter name.""" - param_name_lower = param_name.lower() - - # Look for patterns specific to your architecture - patterns = [ - # embeds.0.layers.N.* (transformer layers within embeds) - r"grad_norm_embeds\.\d+\.layers\.(\d+)\.", - # embeds.0.unembed.N.* (unembedding layers) - r"grad_norm_embeds\.\d+\.unembed\.(\d+)\.", - # embeds.0.ln_final.N.* (final layer norms) - 
r"grad_norm_embeds\.\d+\.ln_final\.(\d+)\.", - # ae_local_blocks.N.* (autoencoder local blocks) - r"grad_norm_ae_local_blocks\.(\d+)\.", - # ae_global_blocks.N.* (autoencoder global blocks) - r"ae_global_blocks\.(\d+)\.", - # ae_adapter.N.* (autoencoder adapter blocks) - r"ae_adapter\.(\d+)\.", - # target_token_engines.0.tte.N.* (target token engine blocks) - r"target_token_engines\.\d+\.tte\.(\d+)\.", - # target_token_engines.0.tte.N.block.M.* (nested blocks) - r"target_token_engines\.\d+\.tte\.(\d+)\.block\.(\d+)\.", - # pred_heads.0.pred_heads.0.N.* (prediction head layers) - r"pred_heads\.\d+\.pred_heads\.\d+\.(\d+)\.", - # Generic patterns for any numbered layers - r"layer[s]?\.(\d+)", - r"h\.(\d+)", - r"transformer\.(\d+)", - r"blocks\.(\d+)", - ] - - for pattern in patterns: - match = re.search(pattern, param_name_lower) - if match: - # For nested patterns (like tte blocks), combine indices - if len(match.groups()) > 1: - # Combine indices: e.g., tte.1.block.2 -> 12 (or 1*10+2) - return int(match.group(1)) * 10 + int(match.group(2)) - else: - return int(match.group(1)) - - # Special handling for components without clear depth - if param_name_lower.startswith("embed_target_coords."): - return 0 # Coordinate embeddings at the start - elif "total_grad_norm" in param_name_lower: - return -2 # Special marker for total norm - elif any(x in param_name_lower for x in ["weathergen", "stage", "q_cells"]): - return -3 # Special marker for metadata - - return -1 # Unknown depth - - def plot_total_gradient_norms(self, figsize=(12, 6)): - """Plot total gradient norm over training steps.""" - # Calculate total norm per step - total_norms = [] - steps = [] - - for ith, entry in enumerate(self.data): - # step = entry.get('num_samples', entry.get('epoch', 0)) - step = ith * 5 - - if "gradients" in entry: - grad_data = entry["gradients"] - elif "grad_norms" in entry: - grad_data = entry["grad_norms"] - else: - grad_data = {k: v for k, v in entry.items() if "grad_norm" in k} - 
- if len(grad_data) == 0: - continue - - # Calculate total norm (L2 norm of all gradients) - total_norm = np.sqrt(sum(float(v) ** 2 for v in grad_data.values())) - total_norms.append(total_norm) - steps.append(step) - - plt.figure(figsize=figsize) - plt.plot(steps, total_norms, linewidth=1.5, alpha=0.8) - plt.xlabel("Training Step") - plt.ylabel("Total Gradient Norm") - plt.title("Total Gradient Norm vs Training Steps") - plt.yscale("log") - plt.grid(True, alpha=0.3) - plt.tight_layout() - plt.savefig("plots/total_grad_norm.png") - - return steps, total_norms - - def plot_layer_type_norms(self, figsize=(14, 8)): - """Plot gradient norms grouped by layer type.""" - if self.df is None: - print("No DataFrame available. Load data first.") - return - - plt.figure(figsize=figsize) - - # Get unique layer types - layer_types = self.df["layer_type"].unique() - print(layer_types) - colors = plt.cm.tab10(np.linspace(0, 1, len(layer_types))) - - for i, layer_type in enumerate(layer_types): - layer_data = self.df[self.df["layer_type"] == layer_type] - - # Calculate mean gradient norm per step for this layer type - mean_norms = layer_data.groupby("num_samples")["grad_norm"].mean() - - plt.plot( - mean_norms.index, mean_norms.values, label=layer_type, color=colors[i], alpha=0.8 - ) - - plt.xlabel("Training Step") - plt.ylabel("Mean Gradient Norm") - plt.title("Gradient Norms by Layer Type") - plt.yscale("log") - plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left") - plt.grid(True, alpha=0.3) - plt.tight_layout() - plt.savefig("plots/grad_norm_by_layer_type.png") - - def plot_layer_depth_analysis(self, figsize=(12, 8)): - """Plot gradient norms by layer depth.""" - if self.df is None: - print("No DataFrame available. 
Load data first.") - return - - # Filter out unknown depths - depth_data = self.df[self.df["layer_depth"] >= 0] - - if len(depth_data) == 0: - print("No layer depth information found in parameter names.") - return - - fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize) - - # Plot 1: Mean gradient norm by depth over time - depths = sorted(depth_data["layer_depth"].unique()) - colors = plt.cm.viridis(np.linspace(0, 1, len(depths))) - - for i, depth in enumerate(depths): - layer_data = depth_data[depth_data["layer_depth"] == depth] - mean_norms = layer_data.groupby("num_samples")["grad_norm"].mean() - - ax1.plot( - mean_norms.index, - mean_norms.values, - label=f"Layer {depth}", - color=colors[i], - alpha=0.8, - ) - - ax1.set_xlabel("Training Step") - ax1.set_ylabel("Mean Gradient Norm") - ax1.set_title("Gradient Norms by Layer Depth") - ax1.set_yscale("log") - ax1.legend(bbox_to_anchor=(1.05, 1), loc="upper left") - ax1.grid(True, alpha=0.3) - - # Plot 2: Heatmap of gradient norms by depth and step - pivot_data = ( - depth_data.groupby(["num_samples", "layer_depth"])["grad_norm"].mean().unstack() - ) - - # Sample data if too many steps for readability - if len(pivot_data) > 100: - sample_idx = np.linspace(0, len(pivot_data) - 1, 100, dtype=int) - pivot_data = pivot_data.iloc[sample_idx] - - im = ax2.imshow( - pivot_data.T, - aspect="auto", - cmap="viridis", - extent=[ - pivot_data.index.min(), - pivot_data.index.max(), - pivot_data.columns.min(), - pivot_data.columns.max(), - ], - ) - ax2.set_xlabel("Training Step") - ax2.set_ylabel("Layer Depth") - ax2.set_title("Gradient Norm Heatmap (Layer Depth vs Step)") - - cbar = plt.colorbar(im, ax=ax2) - cbar.set_label("Gradient Norm") - - plt.tight_layout() - plt.savefig("plots/grad_norm_heatmap.png") - - def plot_gradient_distribution(self, figsize=(15, 10)): - """Plot distribution of gradient norms.""" - if self.df is None: - print("No DataFrame available. 
Load data first.") - return - - fig, axes = plt.subplots(2, 2, figsize=figsize) - - # Plot 1: Histogram of all gradient norms - axes[0, 0].hist(np.log10(self.df["grad_norm"].values), bins=50, alpha=0.7) - axes[0, 0].set_xlabel("Log10(Gradient Norm)") - axes[0, 0].set_ylabel("Frequency") - axes[0, 0].set_title("Distribution of Gradient Norms (Log Scale)") - axes[0, 0].grid(True, alpha=0.3) - - # Plot 2: Box plot by layer type - layer_types = self.df["layer_type"].unique()[:10] # Limit to 10 for readability - plot_data = [ - np.log10(self.df[self.df["layer_type"] == lt]["grad_norm"].values) for lt in layer_types - ] - - axes[0, 1].boxplot(plot_data, labels=layer_types) - axes[0, 1].set_xlabel("Layer Type") - axes[0, 1].set_ylabel("Log10(Gradient Norm)") - axes[0, 1].set_title("Gradient Norm Distribution by Layer Type") - axes[0, 1].tick_params(axis="x", rotation=45) - axes[0, 1].grid(True, alpha=0.3) - - # Plot 3: Gradient norms over time (sample of parameters) - sample_params = self.df["parameter"].unique()[:20] # Sample 20 parameters - for param in sample_params: - param_data = self.df[self.df["parameter"] == param] - axes[1, 0].plot( - param_data["num_samples"], param_data["grad_norm"], alpha=0.6, linewidth=0.8 - ) - - axes[1, 0].set_xlabel("Training Step") - axes[1, 0].set_ylabel("Gradient Norm") - axes[1, 0].set_title("Individual Parameter Gradient Norms (Sample)") - axes[1, 0].set_yscale("log") - axes[1, 0].grid(True, alpha=0.3) - - # Plot 4: Statistics over time - stats_by_step = self.df.groupby("num_samples")["grad_norm"].agg( - ["mean", "std", "min", "max"] - ) - - axes[1, 1].fill_between( - stats_by_step.index, - stats_by_step["mean"] - stats_by_step["std"], - stats_by_step["mean"] + stats_by_step["std"], - alpha=0.3, - label="±1 std", - ) - axes[1, 1].plot(stats_by_step.index, stats_by_step["mean"], label="Mean", linewidth=2) - axes[1, 1].plot( - stats_by_step.index, stats_by_step["max"], label="Max", linewidth=1, alpha=0.8 - ) - axes[1, 1].plot( - 
stats_by_step.index, stats_by_step["min"], label="Min", linewidth=1, alpha=0.8 - ) - - axes[1, 1].set_xlabel("Training Step") - axes[1, 1].set_ylabel("Gradient Norm") - axes[1, 1].set_title("Gradient Norm Statistics Over Time") - axes[1, 1].set_yscale("log") - axes[1, 1].legend() - axes[1, 1].grid(True, alpha=0.3) - - plt.tight_layout() - plt.savefig("plots/grad_norm_over_time.png") - - def generate_summary_report(self): - """Generate a summary report of gradient norm statistics.""" - if self.df is None: - print("No DataFrame available. Load data first.") - return - - print("=== GRADIENT NORMS ANALYSIS REPORT ===") - print(f"Total data points: {len(self.df)}") - print(f"Training steps: {self.df['num_samples'].nunique()}") - print(f"Unique parameters: {self.df['parameter'].nunique()}") - print() - - print("Overall Statistics:") - print(f"Mean gradient norm: {self.df['grad_norm'].mean():.6f}") - print(f"Median gradient norm: {self.df['grad_norm'].median():.6f}") - print(f"Min gradient norm: {self.df['grad_norm'].min():.6f}") - print(f"Max gradient norm: {self.df['grad_norm'].max():.6f}") - print() - - print("Statistics by Layer Type:") - layer_stats = self.df.groupby("layer_type")["grad_norm"].agg( - ["count", "mean", "std", "min", "max"] - ) - print(layer_stats) - print() - - # Check for potential issues - print("Potential Issues:") - very_small = (self.df["grad_norm"] < 1e-6).sum() - very_large = (self.df["grad_norm"] > 10.0).sum() - - if very_small > 0: - print(f"⚠️ {very_small} gradient norms < 1e-6 (possible vanishing gradients)") - if very_large > 0: - print(f"⚠️ {very_large} gradient norms > 10.0 (possible exploding gradients)") - - if very_small == 0 and very_large == 0: - print("✅ No obvious gradient issues detected") - - -# Usage example -def analyze_gradient_file(json_file_path): - """ - Main function to analyze gradient norms from a JSON file. 
- - Usage: - analyze_gradient_file('gradient_norms.jsonl') - """ - - analyzer = GradientNormsAnalyzer(json_file_path) - - # Generate summary report - analyzer.generate_summary_report() - - # Create all plots - print("\n=== GENERATING PLOTS ===") - - print("1. Total gradient norms over time...") - analyzer.plot_total_gradient_norms() - - print("2. Gradient norms by layer type...") - analyzer.plot_layer_type_norms() - - print("3. Layer depth analysis...") - analyzer.plot_layer_depth_analysis() - - print("4. Gradient distribution analysis...") - analyzer.plot_gradient_distribution() - - return analyzer - - -# Example usage: -# uv run python src/weathergen/utils/plot_grad_norms.py results/yvhxm2jc/yvhxm2jc_train_metrics.json -if __name__ == "__main__": - import sys - - analyzer = analyze_gradient_file(sys.argv[1]) From 8bd73835c044e694d7ab068507513c75a3f12a11 Mon Sep 17 00:00:00 2001 From: Jubeku Date: Thu, 16 Oct 2025 15:15:11 +0200 Subject: [PATCH 14/19] rm seaborn from pyproject --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6a06230aa..80654ec01 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,6 @@ dependencies = [ "numexpr>=2.11.0", "weathergen-common", "weathergen-evaluate", - "seaborn>=0.13.2", ] From 9892dfaa76c4519f9814fbd94396f40115b39cdf Mon Sep 17 00:00:00 2001 From: Jubeku Date: Tue, 21 Oct 2025 14:26:26 +0200 Subject: [PATCH 15/19] updating terminal and metrics loggin, add get_tensor_item fct --- src/weathergen/model/model.py | 6 ++-- src/weathergen/train/trainer.py | 55 ++++++++++++++------------------- 2 files changed, 26 insertions(+), 35 deletions(-) diff --git a/src/weathergen/model/model.py b/src/weathergen/model/model.py index 803c0312b..18ec6537b 100644 --- a/src/weathergen/model/model.py +++ b/src/weathergen/model/model.py @@ -596,7 +596,7 @@ def forward(self, model_params: ModelParams, batch, forecast_offset: int, foreca if noise_std > 0.0: tokens = tokens + 
torch.randn_like(tokens) * torch.norm(tokens) * noise_std - tokens = self.forecast(model_params, tokens) + tokens = self.forecast(model_params, tokens, fstep) # prediction for final step preds_all += [ @@ -793,7 +793,7 @@ def assimilate_global(self, model_params: ModelParams, tokens: torch.Tensor) -> return tokens ######################################### - def forecast(self, model_params: ModelParams, tokens: torch.Tensor) -> torch.Tensor: + def forecast(self, model_params: ModelParams, tokens: torch.Tensor, fstep: int) -> torch.Tensor: """Advances latent space representation in time Args: @@ -806,7 +806,7 @@ def forecast(self, model_params: ModelParams, tokens: torch.Tensor) -> torch.Ten """ for it, block in enumerate(self.fe_blocks): - aux_info = torch.tensor([it], dtype=torch.float32, device="cuda") + aux_info = torch.tensor([fstep], dtype=torch.float32, device="cuda") tokens = checkpoint(block, tokens, aux_info, use_reentrant=False) return tokens diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py index f3980a96f..2abe9d921 100644 --- a/src/weathergen/train/trainer.py +++ b/src/weathergen/train/trainer.py @@ -556,7 +556,6 @@ def train(self, epoch): # Unweighted loss, real weighted loss, std for losses that need it self.loss_unweighted_hist, self.loss_model_hist, self.stdev_unweighted_hist = [], [], [] - self.last_grad_norm = 0.0 # training loop self.t_start = time.time() @@ -593,8 +592,11 @@ def train(self, epoch): ) # log gradient norms - if bidx % log_interval == 0 and self.log_grad_norms: - self._log_instant_grad_norms(TRAIN, total_norm) + if self.log_grad_norms: + if bidx % self.train_log_freq.terminal == 0: + self.last_grad_norm = self._get_tensor_item(total_norm) + if bidx % self.train_log_freq.metrics == 0: + self._log_instant_grad_norms(TRAIN, total_norm) # optimizer step self.grad_scaler.step(self.optimizer) @@ -980,31 +982,25 @@ def _log(self, stage: Stage): self.loss_unweighted_hist, self.loss_model_hist, 
self.stdev_unweighted_hist = [], [], [] - def _log_instant_grad_norms(self, stage: Stage, total_norm): + def _get_tensor_item(self, tensor): + """ + When using FSDP2, we need full_tensor().item() instead of .item(), see here: + https://gist.github.com/Kai-46/a9835ef3f36e76d06afee6c11f388144 + """ + return tensor.full_tensor().item() if self.cf.world_size > 1 else tensor.item() + + def _log_instant_grad_norms(self, stage: Stage): """ Log instantaneous grad norms, we do not average because of the cost and because we want to measure the actual values. - - Note: When using FSDP2, we need full_tensor().item() instead of .item(), see here: - https://gist.github.com/Kai-46/a9835ef3f36e76d06afee6c11f388144 """ - self.last_grad_norm = ( - total_norm.full_tensor().item() if self.cf.world_size > 1 else total_norm.item() - ) grad_norms = {"total_grad_norm": self.last_grad_norm} for name, param in self.model.named_parameters(): if param.grad is not None: - # grad_norms["grad_norm_" + name] = param.grad.norm().item() - grad_norms["grad_norm_" + name] = ( - param.grad.norm().full_tensor().item() - if self.cf.world_size > 1 - else param.grad.norm().item() - ) + grad_norms["grad_norm_" + name] = self._get_tensor_item(param.grad.norm()) - # print(".item():", param.grad.norm().item()) - # print(".full_tensor().item()", param.grad.norm().full_tensor().item()) if is_root(): - self.train_logger.log_metrics(TRAIN, grad_norms) + self.train_logger.log_metrics(stage, grad_norms) def _log_terminal(self, bidx: int, epoch: int, stage: Stage): print_freq = self.train_log_freq.terminal @@ -1027,21 +1023,16 @@ def _log_terminal(self, bidx: int, epoch: int, stage: Stage): elif stage == TRAIN: # samples per sec dt = time.time() - self.t_start - pstr = "{:03d} : {:05d}/{:05d} : {:06d} : loss = {:.4E} " - pstr += "(lr={:.2E}, gradient norm={:.3f}, s/sec={:.3f})" len_dataset = len(self.data_loader) // self.cf.batch_size_per_gpu - logger.info( - pstr.format( - epoch, - bidx, - len_dataset, - 
self.cf.istep, - avg_loss.nanmean().item(), - self.lr_scheduler.get_lr(), - self.last_grad_norm, - (print_freq * self.cf.batch_size_per_gpu) / dt, - ), + pstr = ( + f"{epoch:03d} : {bidx:05d}/{len_dataset:05d} : " + + f"{self.cf.istep:06d} : loss = {avg_loss.nanmean().item():.4E} " + + f"(lr={self.lr_scheduler.get_lr():.2E}, " ) + if self.log_grad_norms: + pstr += f"gradient norm={self.last_grad_norm:.3f}, " + pstr += f"s/sec={(print_freq * self.cf.batch_size_per_gpu) / dt:.3f})" + logger.info(pstr) logger.info("\t") for _, st in enumerate(self.cf.streams): logger.info( From 2885062432dd367c120335a1dd45392b4842a796 Mon Sep 17 00:00:00 2001 From: Jubeku Date: Tue, 21 Oct 2025 16:47:26 +0200 Subject: [PATCH 16/19] check for DTensor instead of world size --- src/weathergen/train/trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py index 2abe9d921..b8c189319 100644 --- a/src/weathergen/train/trainer.py +++ b/src/weathergen/train/trainer.py @@ -596,7 +596,7 @@ def train(self, epoch): if bidx % self.train_log_freq.terminal == 0: self.last_grad_norm = self._get_tensor_item(total_norm) if bidx % self.train_log_freq.metrics == 0: - self._log_instant_grad_norms(TRAIN, total_norm) + self._log_instant_grad_norms(TRAIN) # optimizer step self.grad_scaler.step(self.optimizer) @@ -984,10 +984,10 @@ def _log(self, stage: Stage): def _get_tensor_item(self, tensor): """ - When using FSDP2, we need full_tensor().item() instead of .item(), see here: - https://gist.github.com/Kai-46/a9835ef3f36e76d06afee6c11f388144 + When using FSDP2, tensor is a DTensor and we need full_tensor().item() instead of .item(), + see here: https://gist.github.com/Kai-46/a9835ef3f36e76d06afee6c11f388144 """ - return tensor.full_tensor().item() if self.cf.world_size > 1 else tensor.item() + return tensor.full_tensor().item() if isinstance(tensor, DTensor) else tensor.item() def _log_instant_grad_norms(self, stage: 
Stage): """ From cbb1c85e88fb8c2ec2d15b768bfedf12e95396e8 Mon Sep 17 00:00:00 2001 From: Jubeku Date: Tue, 21 Oct 2025 17:33:40 +0200 Subject: [PATCH 17/19] revert forecast fct, fix in separate PR --- src/weathergen/model/model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/weathergen/model/model.py b/src/weathergen/model/model.py index 18ec6537b..803c0312b 100644 --- a/src/weathergen/model/model.py +++ b/src/weathergen/model/model.py @@ -596,7 +596,7 @@ def forward(self, model_params: ModelParams, batch, forecast_offset: int, foreca if noise_std > 0.0: tokens = tokens + torch.randn_like(tokens) * torch.norm(tokens) * noise_std - tokens = self.forecast(model_params, tokens, fstep) + tokens = self.forecast(model_params, tokens) # prediction for final step preds_all += [ @@ -793,7 +793,7 @@ def assimilate_global(self, model_params: ModelParams, tokens: torch.Tensor) -> return tokens ######################################### - def forecast(self, model_params: ModelParams, tokens: torch.Tensor, fstep: int) -> torch.Tensor: + def forecast(self, model_params: ModelParams, tokens: torch.Tensor) -> torch.Tensor: """Advances latent space representation in time Args: @@ -806,7 +806,7 @@ def forecast(self, model_params: ModelParams, tokens: torch.Tensor, fstep: int) """ for it, block in enumerate(self.fe_blocks): - aux_info = torch.tensor([fstep], dtype=torch.float32, device="cuda") + aux_info = torch.tensor([it], dtype=torch.float32, device="cuda") tokens = checkpoint(block, tokens, aux_info, use_reentrant=False) return tokens From 75749df7dd146a7ca1bb23e107d21940cf927e32 Mon Sep 17 00:00:00 2001 From: Jubeku Date: Thu, 23 Oct 2025 10:41:00 +0200 Subject: [PATCH 18/19] rename grad_norm log names to exclude from MLFlow --- src/weathergen/train/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/weathergen/train/trainer.py b/src/weathergen/train/trainer.py index b8c189319..75732cdb4 100644 --- 
a/src/weathergen/train/trainer.py +++ b/src/weathergen/train/trainer.py @@ -994,10 +994,10 @@ def _log_instant_grad_norms(self, stage: Stage): Log instantaneous grad norms, we do not average because of the cost and because we want to measure the actual values. """ - grad_norms = {"total_grad_norm": self.last_grad_norm} + grad_norms = {"grad_norm.total": self.last_grad_norm} for name, param in self.model.named_parameters(): if param.grad is not None: - grad_norms["grad_norm_" + name] = self._get_tensor_item(param.grad.norm()) + grad_norms["grad_norm." + name] = self._get_tensor_item(param.grad.norm()) if is_root(): self.train_logger.log_metrics(stage, grad_norms) From f1c24fa1891979e102c46bfd36c38054835a010d Mon Sep 17 00:00:00 2001 From: Jubeku Date: Fri, 24 Oct 2025 15:32:25 +0200 Subject: [PATCH 19/19] add log_grad_norms to default config --- config/default_config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/default_config.yml b/config/default_config.yml index 679f58dd3..620f5c4ae 100644 --- a/config/default_config.yml +++ b/config/default_config.yml @@ -133,6 +133,7 @@ grad_clip: 1.0 weight_decay: 0.1 norm_type: "LayerNorm" nn_module: "te" +log_grad_norms: False start_date: 197901010000 end_date: 202012310000