diff --git a/README.md b/README.md
index 27fcca7..2245de8 100644
--- a/README.md
+++ b/README.md
@@ -67,12 +67,12 @@ The output image `report.png` contains the following:
 
 ## Explanation of Recommendations
 
-The recommended settings are based on the lowest energy consumption (Watt-min) for each scenario. Energy consumption is calculated as the product of power draw and total time taken.
+The recommended settings are based on the lowest energy consumption (Watt-min) for each scenario. Energy consumption is calculated as the product of power draw and total time taken. The scripts also record **energy per token**, calculated as the instantaneous power draw divided by the token generation rate.
 
 ## File Descriptions
 
-- `training_stats.csv`: Contains columns such as `max_watt`, `tokens_per_sec`, `temperature`, `gpu_utilization`, `memory_utilization`, `loss`, and `timestamp`.
-- `inference_stats.csv`: Contains columns such as `max_watt`, `tokens_per_sec`, `temperature`, `gpu_utilization`, `memory_utilization`, and `timestamp`.
+- `training_stats.csv`: Contains columns such as `max_watt`, `tokens_per_sec`, `total_power_draw`, `energy_per_token`, `temperature`, `gpu_utilization`, `memory_utilization`, `loss`, and `timestamp`.
+- `inference_stats.csv`: Contains columns such as `max_watt`, `tokens_per_sec`, `total_power_draw`, `energy_per_token`, `temperature`, `gpu_utilization`, `memory_utilization`, and `timestamp`.
 - `generate_report.py`:
   - Loads data from CSV files.
   - Cleans data by removing outliers.
diff --git a/generate_report.py b/generate_report.py
index 152089f..09f695e 100644
--- a/generate_report.py
+++ b/generate_report.py
@@ -14,6 +14,16 @@
 training_stats['timestamp'] = pd.to_datetime(training_stats['timestamp'])
 inference_stats['timestamp'] = pd.to_datetime(inference_stats['timestamp'])
 
+# Derive energy_per_token if missing
+if 'energy_per_token' not in inference_stats.columns:
+    inference_stats['energy_per_token'] = (
+        inference_stats['total_power_draw'] / inference_stats['tokens_per_sec']
+    )
+if 'energy_per_token' not in training_stats.columns:
+    training_stats['energy_per_token'] = (
+        training_stats['total_power_draw'] / training_stats['tokens_per_sec']
+    )
+
 # Calculate total time for each row
 inference_stats['time_diff'] = inference_stats['timestamp'].diff().dt.total_seconds().fillna(0)
 training_stats['time_diff'] = training_stats['timestamp'].diff().dt.total_seconds().fillna(0)
@@ -30,7 +40,8 @@ def remove_outliers(df, column):
     'temperature': 'mean',
     'gpu_utilization': 'mean',
     'memory_utilization': 'mean',
-    'time_diff': 'sum'
+    'time_diff': 'sum',
+    'energy_per_token': 'mean'
 }).reset_index()
 
 # Training Metrics
@@ -41,7 +52,8 @@ def remove_outliers(df, column):
     'gpu_utilization': 'mean',
     'memory_utilization': 'mean',
     'loss': 'mean',
-    'time_diff': 'sum'
+    'time_diff': 'sum',
+    'energy_per_token': 'mean'
 }).reset_index()
 
 # Generate summary tables
@@ -82,7 +94,7 @@ def plot_smooth_curve(ax, x, y, title, xlabel, ylabel, highlight_x=None):
     ax.grid(True)
 
 # Plotting the updated charts without outliers
-fig, axs = plt.subplots(6, 2, figsize=(20, 30))
+fig, axs = plt.subplots(7, 2, figsize=(20, 35))
 
 # Add header with current date and time
 fig.suptitle(f'Performance Metrics and Recommendations\nGenerated on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}', fontsize=16)
@@ -103,6 +115,9 @@ def plot_smooth_curve(ax, x, y, title, xlabel, ylabel, highlight_x=None):
 plot_smooth_curve(axs[4, 0], inference_grouped['max_watt'], inference_grouped['time_diff'],
     'Max Power vs. Total Time (Inference)', 'Max Power (W)', 'Total Time (seconds)', highlight_x=optimal_inference_watt)
 
+plot_smooth_curve(axs[5, 0], inference_grouped['max_watt'], inference_grouped['energy_per_token'],
+    'Max Power vs. Energy per Token (Inference)', 'Max Power (W)', 'Energy per Token (W*s/token)', highlight_x=optimal_inference_watt)
+
 # Training Metrics
 plot_smooth_curve(axs[0, 1], training_grouped['max_watt'], training_grouped['tokens_per_sec'],
     'Max Power vs. Tokens per Second (Training)', 'Max Power (W)', 'Tokens per Second', highlight_x=optimal_training_watt)
@@ -119,18 +134,22 @@ def plot_smooth_curve(ax, x, y, title, xlabel, ylabel, highlight_x=None):
 plot_smooth_curve(axs[4, 1], training_grouped['max_watt'], training_grouped['time_diff'],
     'Max Power vs. Total Time (Training)', 'Max Power (W)', 'Total Time (seconds)', highlight_x=optimal_training_watt)
 
+plot_smooth_curve(axs[5, 1], training_grouped['max_watt'], training_grouped['energy_per_token'],
+    'Max Power vs. Energy per Token (Training)', 'Max Power (W)', 'Energy per Token (W*s/token)', highlight_x=optimal_training_watt)
+
 # Summary with recommended settings
 summary_text = (
     f"Recommended Settings:\n"
     f"Optimal Max Power for Training: {optimal_training_watt}W\n"
     f"Optimal Max Power for Inference: {optimal_inference_watt}W\n\n"
     "Recommendations are based on the lowest energy consumption (Watt-min) for each scenario.\n"
-    "Energy consumption is calculated as the product of power draw and total time taken."
+    "Energy consumption is calculated as the product of power draw and total time taken.\n"
+    "Energy per token is the instantaneous power divided by token generation rate."
 )
 
-axs[5, 0].axis('off')
-axs[5, 1].text(0.5, 0.5, summary_text, ha='center', va='center', fontsize=12, wrap=True)
-axs[5, 1].axis('off')
+axs[6, 0].axis('off')
+axs[6, 1].text(0.5, 0.5, summary_text, ha='center', va='center', fontsize=12, wrap=True)
+axs[6, 1].axis('off')
 
 plt.tight_layout(rect=[0, 0, 1, 0.96])
 plt.savefig('report.png')
diff --git a/llm_inference.py b/llm_inference.py
index ef3b6de..8f2f309 100644
--- a/llm_inference.py
+++ b/llm_inference.py
@@ -62,7 +62,9 @@ def load_across_gpus(seq_length, batch_size, model_variant, max_iterations, call
     # Get sample GPU metrics to dynamically generate headers
     sample_metrics = get_gpu_metrics()[0]
     gpu_headers = list(sample_metrics.keys())
-    headers = ['timestamp', 'tokens_per_sec'] + gpu_headers + ['max_watt']
+    headers = ['timestamp', 'tokens_per_sec'] + gpu_headers + [
+        'max_watt', 'total_power_draw', 'energy_per_token'
+    ]
 
     for iteration in range(max_iterations):
         model.eval()
@@ -91,10 +93,18 @@
             from gpu_metrics_utils import collect_power_draw_all_gpus
             total_power = collect_power_draw_all_gpus()
             gpu_metrics = get_gpu_metrics()[0]
-            data = [timestamp, tokens_per_sec] + list(gpu_metrics.values()) + [MAX_WATT, total_power]
+            energy_per_token = total_power / tokens_per_sec if tokens_per_sec else 0
+            data = [
+                timestamp,
+                tokens_per_sec,
+                *list(gpu_metrics.values()),
+                MAX_WATT,
+                total_power,
+                energy_per_token,
+            ]
             if callback:
                 data = callback(data)
-            log_statistics(LOG_FILE, headers + ['total_power_draw'], data)
+            log_statistics(LOG_FILE, headers, data)
             logger.info(f"Logged statistics: {data}")
 
     shutdown_nvml()
diff --git a/llm_training.py b/llm_training.py
index db8f5b0..744860e 100644
--- a/llm_training.py
+++ b/llm_training.py
@@ -73,7 +73,9 @@ def load_across_gpus(gpu_ids, batch_size, seq_length, epochs, learning_rate, cal
     # Get sample GPU metrics to dynamically generate headers
     sample_metrics = get_gpu_metrics()[0]
    gpu_headers = list(sample_metrics.keys())
-    headers = ['timestamp', 'epoch', 'iteration', 'batch', 'loss', 'tokens_per_sec'] + gpu_headers + ['max_watt']
+    headers = ['timestamp', 'epoch', 'iteration', 'batch', 'loss', 'tokens_per_sec'] + gpu_headers + [
+        'max_watt', 'total_power_draw', 'energy_per_token'
+    ]
 
     model.train()
     for epoch in range(epochs):
@@ -101,10 +103,22 @@
                 from gpu_metrics_utils import collect_power_draw_all_gpus
                 total_power = collect_power_draw_all_gpus()
                 gpu_metrics = get_gpu_metrics()[0]
-                data = [timestamp, epoch + 1, iteration, i // batch_size + 1, loss.item(), tokens_per_sec] + list(gpu_metrics.values()) + [MAX_WATT, total_power]
+                energy_per_token = total_power / tokens_per_sec if tokens_per_sec else 0
+                data = [
+                    timestamp,
+                    epoch + 1,
+                    iteration,
+                    i // batch_size + 1,
+                    loss.item(),
+                    tokens_per_sec,
+                    *list(gpu_metrics.values()),
+                    MAX_WATT,
+                    total_power,
+                    energy_per_token,
+                ]
                 if callback:
                     data = callback(data)
-                log_statistics(log_file, headers + ['total_power_draw'], data)
+                log_statistics(log_file, headers, data)
                 logger.info(f"Logged statistics: {data}")
 
     shutdown_nvml()
diff --git a/recommend.py b/recommend.py
index a2ae8ad..ed66dd4 100644
--- a/recommend.py
+++ b/recommend.py
@@ -8,6 +8,15 @@
 training_stats['timestamp'] = pd.to_datetime(training_stats['timestamp'])
 inference_stats['timestamp'] = pd.to_datetime(inference_stats['timestamp'])
 
+if 'energy_per_token' not in inference_stats.columns:
+    inference_stats['energy_per_token'] = (
+        inference_stats['total_power_draw'] / inference_stats['tokens_per_sec']
+    )
+if 'energy_per_token' not in training_stats.columns:
+    training_stats['energy_per_token'] = (
+        training_stats['total_power_draw'] / training_stats['tokens_per_sec']
+    )
+
 # Function to calculate summary statistics
 def calculate_summary(data):
     numeric_data = data.select_dtypes(include='number')
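---

As a sanity check of the energy math this patch introduces, here is a minimal standalone sketch (not part of the patch; the sample rows are hypothetical and merely mirror the CSV schema described in the README):

    import pandas as pd

    # Hypothetical samples mirroring inference_stats.csv: one row per logging interval.
    stats = pd.DataFrame({
        'timestamp': pd.to_datetime([
            '2024-01-01 12:00:00', '2024-01-01 12:00:30', '2024-01-01 12:01:00',
        ]),
        'total_power_draw': [240.0, 238.5, 241.2],  # watts, summed across GPUs
        'tokens_per_sec': [95.0, 97.5, 96.2],
    })

    # Energy per token (W*s/token, i.e. joules per token):
    # instantaneous power draw divided by the token generation rate.
    stats['energy_per_token'] = stats['total_power_draw'] / stats['tokens_per_sec']

    # Total energy (Watt-min), the basis for the recommendations:
    # power draw integrated over the sampling intervals, converted to minutes.
    stats['time_diff'] = stats['timestamp'].diff().dt.total_seconds().fillna(0)
    energy_watt_min = (stats['total_power_draw'] * stats['time_diff']).sum() / 60

    print(stats[['energy_per_token']])
    print(f"Total energy: {energy_watt_min:.2f} Watt-min")

Both quantities fall out of the same logged columns: `energy_per_token` is a per-sample efficiency figure, while the Watt-min total is the quantity the recommendations minimize when picking the optimal `max_watt` for each scenario.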