Log per-batch throughput #10

Open · wants to merge 1 commit into `main`
4 changes: 2 additions & 2 deletions README.md
```diff
@@ -71,8 +71,8 @@ The recommended settings are based on the lowest energy consumption (Watt-min) f

 ## File Descriptions

-- `training_stats.csv`: Contains columns such as `max_watt`, `tokens_per_sec`, `temperature`, `gpu_utilization`, `memory_utilization`, `loss`, and `timestamp`.
-- `inference_stats.csv`: Contains columns such as `max_watt`, `tokens_per_sec`, `temperature`, `gpu_utilization`, `memory_utilization`, and `timestamp`.
+- `training_stats.csv`: Contains columns such as `max_watt`, `tokens_per_sec`, `avg_tokens_per_sec`, `temperature`, `gpu_utilization`, `memory_utilization`, `loss`, and `timestamp`.
+- `inference_stats.csv`: Contains columns such as `max_watt`, `tokens_per_sec`, `avg_tokens_per_sec`, `temperature`, `gpu_utilization`, `memory_utilization`, and `timestamp`.
 - `generate_report.py`:
   - Loads data from CSV files.
   - Cleans data by removing outliers.
```
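With both columns in the CSVs, comparing instantaneous and running-average throughput per power cap is a one-liner. A minimal sketch (pandas and the working-directory file layout are assumptions here, not project requirements):

```python
import pandas as pd

# Compare per-batch vs. running-average throughput at each power cap.
# File and column names come from the README above; pandas is an assumption.
df = pd.read_csv("training_stats.csv")
summary = df.groupby("max_watt")[["tokens_per_sec", "avg_tokens_per_sec"]].mean()
print(summary)
```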
7 changes: 4 additions & 3 deletions llm_inference.py
```diff
@@ -62,7 +62,7 @@ def load_across_gpus(seq_length, batch_size, model_variant, max_iterations, call
     # Get sample GPU metrics to dynamically generate headers
     sample_metrics = get_gpu_metrics()[0]
     gpu_headers = list(sample_metrics.keys())
-    headers = ['timestamp', 'tokens_per_sec'] + gpu_headers + ['max_watt']
+    headers = ['timestamp', 'tokens_per_sec', 'avg_tokens_per_sec'] + gpu_headers + ['max_watt']

     for iteration in range(max_iterations):
         model.eval()
@@ -87,11 +87,12 @@ def load_across_gpus(seq_length, batch_size, model_variant, max_iterations, call

         # Log statistics after each iteration
         timestamp = datetime.now().isoformat()
-        tokens_per_sec = total_tokens / (time.time() - start_time)
+        tokens_per_sec = actual_tokens / batch_time if batch_time > 0 else 0
+        avg_tokens_per_sec = total_tokens / (time.time() - start_time)
         from gpu_metrics_utils import collect_power_draw_all_gpus
         total_power = collect_power_draw_all_gpus()
         gpu_metrics = get_gpu_metrics()[0]
-        data = [timestamp, tokens_per_sec] + list(gpu_metrics.values()) + [MAX_WATT, total_power]
+        data = [timestamp, tokens_per_sec, avg_tokens_per_sec] + list(gpu_metrics.values()) + [MAX_WATT, total_power]
         if callback:
             data = callback(data)
         log_statistics(LOG_FILE, headers + ['total_power_draw'], data)
```
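The hunk references `actual_tokens` and `batch_time`, which are set earlier in the loop, outside this diff. As a self-contained illustration of the two metrics, here is a minimal sketch; `measure_throughput`, its arguments, and the use of `time.perf_counter` are assumptions for illustration, not part of this PR:

```python
import time

def measure_throughput(step_fn, token_count, totals):
    """Hypothetical helper: time one step and return (per-batch, running-average)
    throughput in tokens/sec. The PR computes both values inline instead."""
    t0 = time.perf_counter()
    step_fn()  # run one inference step
    batch_time = time.perf_counter() - t0

    totals["tokens"] += token_count
    tokens_per_sec = token_count / batch_time if batch_time > 0 else 0
    avg_tokens_per_sec = totals["tokens"] / (time.perf_counter() - totals["start"])
    return tokens_per_sec, avg_tokens_per_sec

# Toy usage: a sleep stands in for a real generation step.
totals = {"tokens": 0, "start": time.perf_counter()}
tps, avg_tps = measure_throughput(lambda: time.sleep(0.05), 4096, totals)
print(f"{tps:,.0f} tok/s this batch, {avg_tps:,.0f} tok/s overall")
```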
7 changes: 4 additions & 3 deletions llm_training.py
```diff
@@ -73,7 +73,7 @@ def load_across_gpus(gpu_ids, batch_size, seq_length, epochs, learning_rate, cal
     # Get sample GPU metrics to dynamically generate headers
     sample_metrics = get_gpu_metrics()[0]
     gpu_headers = list(sample_metrics.keys())
-    headers = ['timestamp', 'epoch', 'iteration', 'batch', 'loss', 'tokens_per_sec'] + gpu_headers + ['max_watt']
+    headers = ['timestamp', 'epoch', 'iteration', 'batch', 'loss', 'tokens_per_sec', 'avg_tokens_per_sec'] + gpu_headers + ['max_watt']

     model.train()
     for epoch in range(epochs):
@@ -97,11 +97,12 @@ def load_across_gpus(gpu_ids, batch_size, seq_length, epochs, learning_rate, cal

             # Log statistics after each batch
             timestamp = datetime.now().isoformat()
-            tokens_per_sec = total_tokens / (time.time() - start_time)
+            tokens_per_sec = batch_labels.numel() / batch_time if batch_time > 0 else 0
+            avg_tokens_per_sec = total_tokens / (time.time() - start_time)
             from gpu_metrics_utils import collect_power_draw_all_gpus
             total_power = collect_power_draw_all_gpus()
             gpu_metrics = get_gpu_metrics()[0]
-            data = [timestamp, epoch + 1, iteration, i // batch_size + 1, loss.item(), tokens_per_sec] + list(gpu_metrics.values()) + [MAX_WATT, total_power]
+            data = [timestamp, epoch + 1, iteration, i // batch_size + 1, loss.item(), tokens_per_sec, avg_tokens_per_sec] + list(gpu_metrics.values()) + [MAX_WATT, total_power]
             if callback:
                 data = callback(data)
             log_statistics(log_file, headers + ['total_power_draw'], data)
```
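The two metrics can diverge noticeably: a batch of `batch_labels.numel() == 8192` tokens that finishes in 0.25 s logs 32,768 tok/s for that row, while `avg_tokens_per_sec` divides all tokens so far by total wall time, which also absorbs optimizer and logging overhead. A toy demonstration of that divergence (not project code; step times are simulated):

```python
import random
import time

# Simulate five batches with jittery step times: the per-batch figure
# fluctuates with each batch, while the running average smooths out.
tokens_per_batch = 8192
total_tokens = 0
start = time.perf_counter()
for step in range(5):
    batch_time = random.uniform(0.2, 0.4)  # stand-in for a real training step
    time.sleep(batch_time)
    total_tokens += tokens_per_batch
    per_batch = tokens_per_batch / batch_time if batch_time > 0 else 0
    running_avg = total_tokens / (time.perf_counter() - start)
    print(f"step {step}: {per_batch:,.0f} tok/s (batch), {running_avg:,.0f} tok/s (avg)")
```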