diff --git a/mlperf_logging/result_summarizer/compute_score/README.md b/mlperf_logging/result_summarizer/compute_score/README.md
new file mode 100644
index 0000000..9c35f77
--- /dev/null
+++ b/mlperf_logging/result_summarizer/compute_score/README.md
@@ -0,0 +1,30 @@
+# MLPerf compute standalone score
+
+Standalone score computation for a single MLPerf benchmark.
+
+## Usage
+
+To compute the scores of a single benchmark, run the command below; an example invocation follows the flag descriptions. All result files are assumed to be in the same folder:
+
+```sh
+python3 -m mlperf_logging.result_summarizer.compute_score \
+    --system SYSTEM_NAME --benchmark_folder BENCHMARK_FOLDER --usage USAGE --ruleset RULESET \
+    [--is_weak_scaling] [--scale] [--has_power]
+```
+
+
+**BENCHMARK:** The benchmark being scored, such as rgat, llama31_8b, etc.; it is detected automatically from the result files.
+**SYSTEM_NAME:** Optional system name.
+**BENCHMARK_FOLDER:** Folder containing all the result files of the benchmark.
+**USAGE:** Either "training" or "hpc".
+**RULESET:** Version of the rules that applies, one of "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0".
+**[--is_weak_scaling]:** Treat the benchmark as weak scaling (only applies to HPC).
+**[--scale]:** Compute the scaling.json file (only if the folder does not already contain it).
+**[--has_power]:** Indicates that the results include power measurements.
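+
+For example, a hypothetical invocation scoring an rgat training submission with power measurements (the system name and folder are placeholders) might look like:
+
+```sh
+python3 -m mlperf_logging.result_summarizer.compute_score \
+    --system example_system --benchmark_folder ./results/example_system/rgat \
+    --usage training --ruleset 5.1.0 --has_power
+```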
+
+
+
+## Tested software versions
+Tested and confirmed working using the following software versions:
+
+Python 3.9.18
diff --git a/mlperf_logging/result_summarizer/compute_score/__init__.py b/mlperf_logging/result_summarizer/compute_score/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mlperf_logging/result_summarizer/compute_score/__main__.py b/mlperf_logging/result_summarizer/compute_score/__main__.py
new file mode 100644
index 0000000..4fb78f4
--- /dev/null
+++ b/mlperf_logging/result_summarizer/compute_score/__main__.py
@@ -0,0 +1,188 @@
+from .. import result_summarizer
+from ...rcp_checker import rcp_checker
+from ...compliance_checker.mlp_compliance import usage_choices, rule_choices
+from ...compliance_checker.mlp_parser import parse_file
+from ...benchmark_meta import get_result_file_counts
+import argparse
+import glob
+import json
+import os
+
+
+def get_compute_args():
+    parser = argparse.ArgumentParser(
+        prog="mlperf_logging.result_summarizer.compute_score",
+        description="Compute the score of a single benchmark",
+    )
+    parser.add_argument("--system", type=str, help="System name", default=None)
+    parser.add_argument(
+        "--has_power", action="store_true", help="Compute power score as well"
+    )
+    parser.add_argument(
+        "--benchmark_folder",
+        type=str,
+        help="Folder containing all the result files",
+        required=True,
+    )
+    parser.add_argument(
+        "--usage",
+        type=str,
+        default="training",
+        choices=usage_choices(),
+        help="the usage such as training, hpc, inference_edge, inference_server",
+        required=True,
+    )
+    parser.add_argument(
+        "--ruleset",
+        type=str,
+        choices=rule_choices(),
+        help="the ruleset such as 0.6.0, 0.7.0, or 1.0.0",
+        required=True,
+    )
+    parser.add_argument(
+        "--is_weak_scaling", action="store_true", help="Compute weak scaling score"
+    )
+    parser.add_argument(
+        "--scale", action="store_true", help="Compute the scaling factor"
+    )
+
+    return parser.parse_args()
+
+
+def print_benchmark_info(args, benchmark):
+    print("INFO -------------------------------------------------------")
+    print(f"MLPerf {args.usage}")
+    print(f"Folder: {args.benchmark_folder}")
+    print(f"Version: {args.ruleset}")
+    print(f"System: {args.system}")
+    print(f"Benchmark: {benchmark}")
+    print("-------------------------------------------------------------")
+
+
+def _reset_scaling(results_dir):
+    filepath = results_dir + "/scaling.json"
+    if os.path.exists(filepath):
+        os.remove(filepath)
+
+
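+# A minimal sketch of the scaling.json layout that the _get_scaling_factor
+# helper below expects; the numeric value is only an illustrative placeholder:
+#
+#     {"scaling_factor": 2.0}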
+def _get_scaling_factor(results_dir):
+    scaling_factor = 1.0
+    scaling_file = results_dir + "/scaling.json"
+    if os.path.exists(scaling_file):
+        with open(scaling_file, "r") as f:
+            contents = json.load(f)
+            scaling_factor = contents["scaling_factor"]
+    return scaling_factor
+
+
+def _find_benchmark(result_file, ruleset):
+    loglines, _ = parse_file(result_file, ruleset)
+    benchmark = None
+    for logline in loglines:
+        if logline.key == "submission_benchmark":
+            benchmark = logline.value["value"]
+            break
+    if benchmark is None:
+        raise ValueError("Benchmark not specified in result file")
+    return benchmark
+
+
+def _epochs_samples_to_converge(result_file, ruleset):
+    loglines, _ = parse_file(result_file, ruleset)
+    epoch_num = None
+    samples_count = None
+    for logline in loglines:
+        if logline.key == "eval_accuracy":
+            if "epoch_num" in logline.value["metadata"]:
+                epoch_num = logline.value["metadata"]["epoch_num"]
+            if "samples_count" in logline.value["metadata"]:
+                samples_count = logline.value["metadata"]["samples_count"]
+    if samples_count is not None:
+        return samples_count
+    if epoch_num is not None:
+        return epoch_num
+    raise ValueError(
+        "Not enough values specified in result file. One of ('samples_count') "
+        "or ('epoch_num') is needed"
+    )
+
+
+args = get_compute_args()
+_reset_scaling(args.benchmark_folder)
+pattern = "{folder}/result_*.txt".format(folder=args.benchmark_folder)
+result_files = glob.glob(pattern, recursive=True)
+benchmark = _find_benchmark(result_files[0], args.ruleset)
+required_runs = get_result_file_counts(args.usage)[benchmark]
+if required_runs > len(result_files):
+    print(
+        f"WARNING: Not enough runs found for an official submission."
+        f" Found: {len(result_files)}, required: {required_runs}"
+    )
+
+if args.scale:
+    rcp_checker.check_directory(
+        args.benchmark_folder,
+        args.usage,
+        args.ruleset,
+        False,
+        False,
+        rcp_file=None,
+        rcp_pass="pruned_rcps",
+        rcp_bypass=False,
+        set_scaling=True,
+    )
+
+scaling_factor = _get_scaling_factor(args.benchmark_folder)
+
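+# Weak-scaling submissions are scored with the summarizer's weak-score helper.
+# Otherwise each run's Time to Train is reported together with the olympic
+# score (an average taken after dropping the extreme runs); when fewer runs
+# than required are found, olympic scoring is skipped and a plain mean scaled
+# by the scaling factor is reported instead.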
+if args.is_weak_scaling:
+    scores, power_scores = result_summarizer._compute_weak_score_standalone(
+        benchmark,
+        args.system,
+        args.has_power,
+        args.benchmark_folder,
+        args.usage,
+        args.ruleset,
+    )
+    print_benchmark_info(args, benchmark)
+    print(f"Scores: {scores}")
+    if power_scores:
+        print(f"Power Scores - Energy (kJ): {power_scores}")
+else:
+    scores_track, power_scores_track, score, power_score = (
+        result_summarizer._compute_strong_score_standalone(
+            benchmark,
+            args.system,
+            args.has_power,
+            args.benchmark_folder,
+            args.usage,
+            args.ruleset,
+            return_full_scores=True,
+        )
+    )
+    print_benchmark_info(args, benchmark)
+    mean_score = 0
+    for file, s in scores_track.items():
+        epochs_samples_to_converge = _epochs_samples_to_converge(file, args.ruleset)
+        print(
+            f"Score - Time to Train (minutes) for {file}: {s}. Samples/Epochs to converge: {epochs_samples_to_converge}"
+        )
+        mean_score += s
+    mean_score /= len(result_files)
+    mean_score *= scaling_factor
+    if required_runs > len(result_files):
+        print("WARNING: Olympic scoring skipped")
+        print(f"Final score - Time to Train (minutes): {mean_score}")
+    else:
+        print(f"Final score - Time to Train (minutes): {score}")
+    if power_score:
+        mean_power = 0
+        for file, ps in power_scores_track.items():
+            print(f"Power Score - Energy (kJ) for {file}: {ps}")
+            mean_power += ps
+        mean_power /= len(result_files)
+        mean_power *= scaling_factor
+        if required_runs > len(result_files):
+            print("WARNING: Olympic scoring skipped")
+            print(f"Final score - Energy (kJ): {mean_power}")
+        else:
+            print(f"Power Score - Energy (kJ): {power_score}")
diff --git a/mlperf_logging/result_summarizer/result_summarizer.py b/mlperf_logging/result_summarizer/result_summarizer.py
index 194b636..1cd3fb7 100644
--- a/mlperf_logging/result_summarizer/result_summarizer.py
+++ b/mlperf_logging/result_summarizer/result_summarizer.py
@@ -324,6 +324,154 @@ def _get_scaling_factor(folder):
     return scaling_factor
 
 
+
+def _compute_strong_score_standalone(
+    benchmark,
+    system,
+    has_power,
+    benchmark_folder,
+    usage,
+    ruleset,
+    desc={"submitter": None},
+    return_full_scores=False,
+):
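+    """Compute the strong-scaling (time-to-train) score for one benchmark folder.
+
+    Returns (score, power_score), both olympic averages scaled by the scaling
+    factor; with return_full_scores=True the per-result-file scores are
+    returned as well.
+    """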
+    pattern = "{folder}/result_*.txt".format(folder=benchmark_folder)
+    result_files = glob.glob(pattern, recursive=True)
+    scores = []
+    scores_track = {}
+    power_scores = []
+    power_scores_track = {}
+    dropped_scores = 0
+    for result_file in result_files:
+        try:
+            loglines = _read_result_file(result_file, usage, ruleset)
+            start, stop = _query_run_start_stop(loglines)
+            time_to_train_ms = stop - start
+            scores.append(time_to_train_ms / 60 / 1000)
+            scores_track[result_file] = scores[-1]
+        except ValueError as e:
+            print("{} in {}".format(e, result_file))
+            dropped_scores += 1
+            continue
+        if has_power:
+            power_scores.append(
+                _compute_total_power(
+                    benchmark_folder, result_file, time_to_train_ms, ruleset
+                )
+            )
+            power_scores_track[result_file] = power_scores[-1]
+    max_dropped_scores = 4 if benchmark == "unet3d" else 1
+    if dropped_scores > max_dropped_scores:
+        print(
+            "CRITICAL ERROR: Too many non-converging runs "
+            "for {} {}/{}".format(desc["submitter"], system, benchmark)
+        )
+        print(
+            "** CRITICAL ERROR ** Results in the table for {} {}/{} are "
+            "NOT correct".format(desc["submitter"], system, benchmark)
+        )
+    elif dropped_scores >= 1:
+        print(
+            "NOTICE: Dropping non-converged run(s) for {} {}/{} using "
+            "olympic scoring.".format(
+                desc["submitter"],
+                system,
+                benchmark,
+            )
+        )
+
+    if has_power:
+        unsorted_scores = scores.copy()
+
+    score = None
+    scaling_factor = _get_scaling_factor(benchmark_folder)
+    if dropped_scores <= max_dropped_scores:
+        olympic_avg = _compute_olympic_average(
+            scores, dropped_scores, max_dropped_scores
+        )
+        if olympic_avg is not None:
+            score = olympic_avg
+            score *= scaling_factor
+
+    power_score = None
+    if has_power and dropped_scores <= max_dropped_scores:
+        index = [i[0] for i in sorted(enumerate(unsorted_scores), key=lambda x: x[1])]
+        olympic_avg = _index_olympic_average(
+            power_scores, index, dropped_scores, max_dropped_scores
+        )
+        if olympic_avg is not None:
+            power_score = olympic_avg
+            power_score *= scaling_factor
+    if return_full_scores:
+        return scores_track, power_scores_track, score, power_score
+    return score, power_score
+
+
+def _compute_weak_score_standalone(benchmark, system, has_power, benchmark_folder, usage, ruleset, desc={"submitter": None}):
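+    """Compute weak-scaling scores for one benchmark folder.
+
+    Returns a (scores, power) pair of dicts keyed by '<benchmark>:<metric>'
+    for time_to_train_all (minutes), number_of_models and instance_scale.
+    """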
+    power_scores = []
+    # Read scores from result files.
+    pattern = '{folder}/result_*.txt'.format(folder=benchmark_folder)
+    result_files = glob.glob(pattern, recursive=True)
+    global_start, global_stop = float('inf'), float('-inf')
+    number_of_models = 0
+    instance_scale = None
+    for result_file in result_files:
+        try:
+            loglines = _read_result_file(result_file, usage, ruleset)
+            start, stop = _query_run_start_stop(loglines)
+            global_start = min(global_start, start)
+            global_stop = max(global_stop, stop)
+            number_of_models += 1
+            if instance_scale == None:
+                instance_scale = _query_instance_scale(loglines)
+            else:
+                assert instance_scale == _query_instance_scale(loglines)
+        except ValueError as e:
+            print('{} in {}'.format(e, result_file))
+            continue
+        if has_power:
+            time_to_train_ms = stop - start
+            power_scores.append(_compute_total_power(benchmark_folder, result_file, time_to_train_ms, ruleset))
+
+    scores = {}
+    power = {}
+    if number_of_models >= get_result_file_counts(usage)[benchmark]:
+        scores['{}:{}'.format(
+            benchmark,
+            'time_to_train_all',
+        )] = (global_stop - global_start) / 60 / 1000
+        scores['{}:{}'.format(
+            benchmark,
+            'number_of_models',
+        )] = number_of_models
+        scores['{}:{}'.format(
+            benchmark,
+            'instance_scale',
+        )] = instance_scale
+    else:
+        print('CRITICAL ERROR: Not enough converging weak scaling runs '
+              'for {} {}/{}'.format(desc['submitter'], system, benchmark))
+
+    if has_power:
+        olympic_avg = _compute_olympic_average(
+            power_scores, 1, 1)
+        if olympic_avg is not None:
+            power['{}:{}'.format(
+                benchmark,
+                'time_to_train_all',
+            )] = olympic_avg
+            power['{}:{}'.format(
+                benchmark,
+                'number_of_models',
+            )] = olympic_avg
+            power['{}:{}'.format(
+                benchmark,
+                'instance_scale',
+            )] = olympic_avg
+
+    return scores, power
+
+
+
 def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset):
     # Collect scores for benchmarks.
     benchmark_scores = {}
@@ -340,55 +488,11 @@ def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset):
         benchmark = _benchmark_alias(folder_parts[-1])
         system = folder_parts[-3] if usage == 'hpc' else folder_parts[-2]
         # Read scores from result files.
-        pattern = '{folder}/result_*.txt'.format(folder=benchmark_folder)
-        result_files = glob.glob(pattern, recursive=True)
-        scores = []
-        power_scores = []
-        dropped_scores = 0
-        for result_file in result_files:
-            try:
-                loglines = _read_result_file(result_file, usage, ruleset)
-                start, stop = _query_run_start_stop(loglines)
-                time_to_train_ms = stop - start
-                scores.append(time_to_train_ms / 60 / 1000)
-            except ValueError as e:
-                print('{} in {}'.format(e, result_file))
-                dropped_scores += 1
-                continue
-            if has_power:
-                power_scores.append(_compute_total_power(benchmark_folder, result_file, time_to_train_ms, ruleset))
-        max_dropped_scores = 4 if benchmark == 'unet3d' else 1
-        if dropped_scores > max_dropped_scores:
-            print('CRITICAL ERROR: Too many non-converging runs '
-                  'for {} {}/{}'.format(desc['submitter'], system, benchmark))
-            print('** CRITICAL ERROR ** Results in the table for {} {}/{} are '
-                  'NOT correct'.format(desc['submitter'], system, benchmark))
-        elif dropped_scores >= 1:
-            print('NOTICE: Dropping non-converged run(s) for {} {}/{} using '
-                  'olympic scoring.'.format(
-                      desc['submitter'],
-                      system,
-                      benchmark,
-                  ))
-
-        if has_power:
-            unsorted_scores = scores.copy()
-
-        scaling_factor = _get_scaling_factor(benchmark_folder)
-        if dropped_scores <= max_dropped_scores:
-            olympic_avg = _compute_olympic_average(
-                scores, dropped_scores, max_dropped_scores)
-            if olympic_avg is not None:
-                benchmark_scores[benchmark] = olympic_avg
-                benchmark_scores[benchmark] *= scaling_factor
-
-        if has_power and dropped_scores <= max_dropped_scores:
-            index = [i[0] for i in sorted(enumerate(unsorted_scores), key=lambda x:x[1])]
-            olympic_avg = _index_olympic_average(
-                power_scores, index, dropped_scores, max_dropped_scores)
-            if olympic_avg is not None:
-                benchmark_power_scores[benchmark] = olympic_avg
-                benchmark_power_scores[benchmark] *= scaling_factor
+        score, power_score = _compute_strong_score_standalone(benchmark, system, has_power, benchmark_folder, usage, ruleset, desc)
+        if score is not None:
+            benchmark_scores[benchmark] = score
+        if power_score is not None:
+            benchmark_power_scores[benchmark] = power_score
     _fill_empty_benchmark_scores(benchmark_scores, usage, ruleset)
     if len(benchmark_power_scores) > 0:
         _fill_empty_benchmark_scores(benchmark_power_scores, usage, ruleset)
@@ -426,64 +530,53 @@ def _compute_weak_scaling_scores(desc, system_folder, usage, ruleset):
         system = folder_parts[-3]
         # Check if this benchmark has power results
         has_power = _has_power(benchmark_folder)
-        power_scores = []
-        # Read scores from result files.
-        pattern = '{folder}/result_*.txt'.format(folder=benchmark_folder)
-        result_files = glob.glob(pattern, recursive=True)
-        global_start, global_stop = float('inf'), float('-inf')
-        number_of_models = 0
-        instance_scale = None
-        for result_file in result_files:
-            try:
-                loglines = _read_result_file(result_file, usage, ruleset)
-                start, stop = _query_run_start_stop(loglines)
-                global_start = min(global_start, start)
-                global_stop = max(global_stop, stop)
-                number_of_models += 1
-                if instance_scale == None:
-                    instance_scale = _query_instance_scale(loglines)
-                else:
-                    assert instance_scale == _query_instance_scale(loglines)
-            except ValueError as e:
-                print('{} in {}'.format(e, result_file))
-                continue
-            if has_power:
-                time_to_train_ms = stop - start
-                power_scores.append(_compute_total_power(benchmark_folder, result_file, time_to_train_ms, ruleset))
-
-        if number_of_models >= get_result_file_counts(usage)[benchmark]:
+        scores, power_scores = _compute_weak_score_standalone(benchmark, system, has_power, benchmark_folder, usage, ruleset, desc)
+
+        if scores:
             benchmark_scores['{}:{}'.format(
                 benchmark,
                 'time_to_train_all',
-            )] = (global_stop - global_start) / 60 / 1000
+            )] = scores['{}:{}'.format(
+                benchmark,
+                'time_to_train_all',
+            )]
             benchmark_scores['{}:{}'.format(
                 benchmark,
                 'number_of_models',
-            )] = number_of_models
+            )] = scores['{}:{}'.format(
+                benchmark,
+                'number_of_models',
+            )]
             benchmark_scores['{}:{}'.format(
                 benchmark,
                 'instance_scale',
-            )] = instance_scale
-        else:
-            print('CRITICAL ERROR: Not enough converging weak scaling runs '
-                  'for {} {}/{}'.format(desc['submitter'], system, benchmark))
+            )] = scores['{}:{}'.format(
+                benchmark,
+                'instance_scale',
+            )]
 
-        if has_power:
-            olympic_avg = _compute_olympic_average(
-                power_scores, 1, 1)
-            if olympic_avg is not None:
-                benchmark_power_scores['{}:{}'.format(
-                    benchmark,
-                    'time_to_train_all',
-                )] = olympic_avg
-                benchmark_power_scores['{}:{}'.format(
-                    benchmark,
-                    'number_of_models',
-                )] = olympic_avg
-                benchmark_power_scores['{}:{}'.format(
-                    benchmark,
-                    'instance_scale',
-                )] = olympic_avg
+        if power_scores:
+            benchmark_power_scores['{}:{}'.format(
+                benchmark,
+                'time_to_train_all',
+            )] = power_scores['{}:{}'.format(
+                benchmark,
+                'time_to_train_all',
+            )]
+            benchmark_power_scores['{}:{}'.format(
+                benchmark,
+                'number_of_models',
+            )] = power_scores['{}:{}'.format(
+                benchmark,
+                'number_of_models',
+            )]
+            benchmark_power_scores['{}:{}'.format(
+                benchmark,
+                'instance_scale',
+            )] = power_scores['{}:{}'.format(
+                benchmark,
+                'instance_scale',
+            )]
 
     _fill_empty_benchmark_scores(benchmark_scores, usage,