diff --git a/README.md b/README.md index 27fcca7..eb59ed0 100644 --- a/README.md +++ b/README.md @@ -71,8 +71,8 @@ The recommended settings are based on the lowest energy consumption (Watt-min) f ## File Descriptions -- `training_stats.csv`: Contains columns such as `max_watt`, `tokens_per_sec`, `temperature`, `gpu_utilization`, `memory_utilization`, `loss`, and `timestamp`. -- `inference_stats.csv`: Contains columns such as `max_watt`, `tokens_per_sec`, `temperature`, `gpu_utilization`, `memory_utilization`, and `timestamp`. +- `training_stats.csv`: Contains columns such as `max_watt`, `tokens_per_sec`, `avg_tokens_per_sec`, `temperature`, `gpu_utilization`, `memory_utilization`, `loss`, and `timestamp`. +- `inference_stats.csv`: Contains columns such as `max_watt`, `tokens_per_sec`, `avg_tokens_per_sec`, `temperature`, `gpu_utilization`, `memory_utilization`, and `timestamp`. - `generate_report.py`: - Loads data from CSV files. - Cleans data by removing outliers. diff --git a/llm_inference.py b/llm_inference.py index ef3b6de..7a3434d 100644 --- a/llm_inference.py +++ b/llm_inference.py @@ -62,7 +62,7 @@ def load_across_gpus(seq_length, batch_size, model_variant, max_iterations, call # Get sample GPU metrics to dynamically generate headers sample_metrics = get_gpu_metrics()[0] gpu_headers = list(sample_metrics.keys()) - headers = ['timestamp', 'tokens_per_sec'] + gpu_headers + ['max_watt'] + headers = ['timestamp', 'tokens_per_sec', 'avg_tokens_per_sec'] + gpu_headers + ['max_watt'] for iteration in range(max_iterations): model.eval() @@ -87,11 +87,12 @@ def load_across_gpus(seq_length, batch_size, model_variant, max_iterations, call # Log statistics after each iteration timestamp = datetime.now().isoformat() - tokens_per_sec = total_tokens / (time.time() - start_time) + tokens_per_sec = actual_tokens / batch_time if batch_time > 0 else 0 + avg_tokens_per_sec = total_tokens / (time.time() - start_time) from gpu_metrics_utils import collect_power_draw_all_gpus total_power = collect_power_draw_all_gpus() gpu_metrics = get_gpu_metrics()[0] - data = [timestamp, tokens_per_sec] + list(gpu_metrics.values()) + [MAX_WATT, total_power] + data = [timestamp, tokens_per_sec, avg_tokens_per_sec] + list(gpu_metrics.values()) + [MAX_WATT, total_power] if callback: data = callback(data) log_statistics(LOG_FILE, headers + ['total_power_draw'], data) diff --git a/llm_training.py b/llm_training.py index db8f5b0..d14660d 100644 --- a/llm_training.py +++ b/llm_training.py @@ -73,7 +73,7 @@ def load_across_gpus(gpu_ids, batch_size, seq_length, epochs, learning_rate, cal # Get sample GPU metrics to dynamically generate headers sample_metrics = get_gpu_metrics()[0] gpu_headers = list(sample_metrics.keys()) - headers = ['timestamp', 'epoch', 'iteration', 'batch', 'loss', 'tokens_per_sec'] + gpu_headers + ['max_watt'] + headers = ['timestamp', 'epoch', 'iteration', 'batch', 'loss', 'tokens_per_sec', 'avg_tokens_per_sec'] + gpu_headers + ['max_watt'] model.train() for epoch in range(epochs): @@ -97,11 +97,12 @@ def load_across_gpus(gpu_ids, batch_size, seq_length, epochs, learning_rate, cal # Log statistics after each batch timestamp = datetime.now().isoformat() - tokens_per_sec = total_tokens / (time.time() - start_time) + tokens_per_sec = batch_labels.numel() / batch_time if batch_time > 0 else 0 + avg_tokens_per_sec = total_tokens / (time.time() - start_time) from gpu_metrics_utils import collect_power_draw_all_gpus total_power = collect_power_draw_all_gpus() gpu_metrics = get_gpu_metrics()[0] - data = [timestamp, epoch + 1, iteration, i // batch_size + 1, loss.item(), tokens_per_sec] + list(gpu_metrics.values()) + [MAX_WATT, total_power] + data = [timestamp, epoch + 1, iteration, i // batch_size + 1, loss.item(), tokens_per_sec, avg_tokens_per_sec] + list(gpu_metrics.values()) + [MAX_WATT, total_power] if callback: data = callback(data) log_statistics(log_file, headers + ['total_power_draw'], data)