feat(reth-bench): add gas throughput chart to python script (paradigmxyz#17572)

shekhirin · claude · web-flow · commit 58235419bb85 · 2025-07-22T18:51:11.000Z
Co-authored-by: Claude <noreply@anthropic.com>
diff --git a/.gitignore b/.gitignore
@@ -63,3 +63,8 @@ recipe.json
 _
 # broken links report
 links-report.json
+
+# Python cache
+__pycache__/
+*.py[cod]
+*$py.class
diff --git a/bin/reth-bench/scripts/compare_newpayload_latency.py b/bin/reth-bench/scripts/compare_newpayload_latency.py
@@ -16,32 +16,89 @@
 #
 # - A simple line graph plotting the latencies of the two files against each
 #   other.
+#
+# - A gas per second (gas/s) chart showing throughput over time.
 
 
 import argparse
 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
 import sys
+import os
+from matplotlib.ticker import FuncFormatter
+
+def get_output_filename(base_path, suffix=None):
+    """Return ``base_path`` with ``_<suffix>`` inserted before its extension.
+
+    When ``suffix`` is ``None`` the path is handed back unchanged. Otherwise
+    the final path component is split into stem and extension, the suffix is
+    appended to the stem, and the directory part (if any) is re-attached.
+    """
+    if suffix is None:
+        return base_path
+
+    # os.path.split yields the same (dirname, basename) pair as calling
+    # os.path.dirname and os.path.basename separately.
+    directory, filename = os.path.split(base_path)
+    stem, extension = os.path.splitext(filename)
+    suffixed = f"{stem}_{suffix}{extension}"
+
+    # A bare filename has no directory part to join back on.
+    if not directory:
+        return suffixed
+    return os.path.join(directory, suffixed)
+
+def format_gas_units(value, pos):
+    """Format a gas-throughput value with a scaled unit suffix.
+
+    The ``(value, pos)`` signature lets this be wrapped in
+    ``matplotlib.ticker.FuncFormatter``; ``pos`` is not used.
+
+    The value is scaled to the largest of gas, Kgas, Mgas, Ggas or Tgas whose
+    threshold its magnitude reaches, then printed with 0, 1 or 2 decimals
+    depending on whether the scaled magnitude is >= 100, >= 10 or smaller.
+    Exactly zero is rendered as a bare ``'0'``.
+    """
+    if value == 0:
+        return '0'
+
+    # Largest unit first, so the first threshold reached is the best fit.
+    units = (
+        (1e12, 'Tgas'),  # Teragas
+        (1e9, 'Ggas'),   # Gigagas
+        (1e6, 'Mgas'),   # Megagas
+        (1e3, 'Kgas'),   # Kilogas
+        (1, 'gas'),      # gas
+    )
+
+    magnitude = abs(value)
+    for threshold, unit in units:
+        if magnitude < threshold:
+            continue
+
+        scaled_value = value / threshold
+        # Pick the precision from the magnitude, not the signed value, so a
+        # negative number gets the same number of decimals as its positive
+        # counterpart instead of always falling into the 2-decimal branch.
+        scaled_magnitude = magnitude / threshold
+        if scaled_magnitude >= 100:
+            decimals = 0
+        elif scaled_magnitude >= 10:
+            decimals = 1
+        else:
+            decimals = 2
+        return f'{scaled_value:.{decimals}f}{unit}/s'
+
+    # Magnitude below 1: no unit threshold was reached.
+    return f'{value:.0f}gas/s'
+
+def moving_average(data, window_size):
+    """Smooth ``data`` with a centered rolling mean over ``window_size`` points.
+
+    A window of 1 or less returns ``data`` itself, untouched. Otherwise the
+    result is the ``.values`` array of a pandas rolling mean with the same
+    length as the input; because ``min_periods=1``, positions near the edges
+    are averaged over however many points fall inside the window.
+    """
+    if window_size > 1:
+        # pandas performs the rolling-window computation
+        rolling = pd.Series(data).rolling(window=window_size, center=True, min_periods=1)
+        return rolling.mean().values
+    return data
 
 def main():
     parser = argparse.ArgumentParser(description='Generate histogram of total_latency percent differences between two CSV files')
     parser.add_argument('baseline_csv', help='First CSV file, used as the baseline/control')
     parser.add_argument('comparison_csv', help='Second CSV file, which is being compared to the baseline')
     parser.add_argument('-o', '--output', default='latency.png', help='Output image file (default: latency.png)')
-    parser.add_argument('--graphs', default='all', help='Comma-separated list of graphs to plot: histogram, line, all (default: all)')
+    parser.add_argument('--graphs', default='all', help='Comma-separated list of graphs to plot: histogram, line, gas, all (default: all)')
+    parser.add_argument('--average', type=int, metavar='N', help='Apply moving average over N blocks to smooth line and gas charts')
+    parser.add_argument('--separate', action='store_true', help='Output each chart as a separate file')
 
     args = parser.parse_args()
 
     # Parse graph selection
     if args.graphs.lower() == 'all':
-        selected_graphs = {'histogram', 'line'}
+        selected_graphs = {'histogram', 'line', 'gas'}
     else:
         selected_graphs = set(graph.strip().lower() for graph in args.graphs.split(','))
-        valid_graphs = {'histogram', 'line'}
+        valid_graphs = {'histogram', 'line', 'gas'}
         invalid_graphs = selected_graphs - valid_graphs
         if invalid_graphs:
-            print(f"Error: Invalid graph types: {', '.join(invalid_graphs)}. Valid options are: histogram, line, all", file=sys.stderr)
+            print(f"Error: Invalid graph types: {', '.join(invalid_graphs)}. Valid options are: histogram, line, gas, all", file=sys.stderr)
             sys.exit(1)
 
     try:
@@ -62,6 +119,15 @@ def main():
         print(f"Error: 'total_latency' column not found in {args.comparison_csv}", file=sys.stderr)
         sys.exit(1)
 
+    # Check for gas_used column if gas graph is selected
+    if 'gas' in selected_graphs:
+        if 'gas_used' not in df1.columns:
+            print(f"Error: 'gas_used' column not found in {args.baseline_csv} (required for gas graph)", file=sys.stderr)
+            sys.exit(1)
+        if 'gas_used' not in df2.columns:
+            print(f"Error: 'gas_used' column not found in {args.comparison_csv} (required for gas graph)", file=sys.stderr)
+            sys.exit(1)
+
     if len(df1) != len(df2):
         print("Warning: CSV files have different number of rows. Using minimum length.", file=sys.stderr)
         min_len = min(len(df1), len(df2))
@@ -93,23 +159,35 @@ def main():
         print("Error: No valid graphs selected", file=sys.stderr)
         sys.exit(1)
 
-    if num_plots == 1:
-        fig, ax = plt.subplots(1, 1, figsize=(12, 6))
-        axes = [ax]
+    # Store output filenames
+    output_files = []
+    
+    if args.separate:
+        # We'll create individual figures for each graph
+        pass
     else:
-        fig, axes = plt.subplots(num_plots, 1, figsize=(12, 6 * num_plots))
+        # Create combined figure
+        if num_plots == 1:
+            fig, ax = plt.subplots(1, 1, figsize=(12, 6))
+            axes = [ax]
+        else:
+            fig, axes = plt.subplots(num_plots, 1, figsize=(12, 6 * num_plots))
 
     plot_idx = 0
 
     # Plot histogram if selected
     if 'histogram' in selected_graphs:
+        if args.separate:
+            fig, ax = plt.subplots(1, 1, figsize=(12, 6))
+        else:
+            ax = axes[plot_idx]
+            
         min_diff = np.floor(percent_diff.min())
         max_diff = np.ceil(percent_diff.max())
 
         # Create histogram with 1% buckets
         bins = np.arange(min_diff, max_diff + 1, 1)
 
-        ax = axes[plot_idx]
         ax.hist(percent_diff, bins=bins, edgecolor='black', alpha=0.7)
         ax.set_xlabel('Percent Difference (%)')
         ax.set_ylabel('Number of Blocks')
@@ -120,47 +198,169 @@ def main():
         ax.axvline(mean_diff, color='red', linestyle='--', label=f'Mean: {mean_diff:.2f}%')
         ax.axvline(median_diff, color='orange', linestyle='--', label=f'Median: {median_diff:.2f}%')
         ax.legend()
-        plot_idx += 1
+        
+        if args.separate:
+            plt.tight_layout()
+            output_file = get_output_filename(args.output, 'histogram')
+            plt.savefig(output_file, dpi=300, bbox_inches='tight')
+            output_files.append(output_file)
+            plt.close(fig)
+        else:
+            plot_idx += 1
 
     # Plot line graph if selected
     if 'line' in selected_graphs:
+        if args.separate:
+            fig, ax = plt.subplots(1, 1, figsize=(12, 6))
+        else:
+            ax = axes[plot_idx]
+            
         # Determine comparison color based on median change. The median being
         # negative means processing time got faster, so that becomes green.
         comparison_color = 'green' if median_diff < 0 else 'red'
 
-        ax = axes[plot_idx]
+        # Apply moving average if requested
+        plot_latency1 = latency1[:len(percent_diff)]
+        plot_latency2 = latency2[:len(percent_diff)]
+        
+        if args.average:
+            plot_latency1 = moving_average(plot_latency1, args.average)
+            plot_latency2 = moving_average(plot_latency2, args.average)
         if 'block_number' in df1.columns and 'block_number' in df2.columns:
             block_numbers = df1['block_number'].values[:len(percent_diff)]
-            ax.plot(block_numbers, latency1[:len(percent_diff)], 'orange', alpha=0.7, label=f'Baseline ({args.baseline_csv})')
-            ax.plot(block_numbers, latency2[:len(percent_diff)], comparison_color, alpha=0.7, label=f'Comparison ({args.comparison_csv})')
+            ax.plot(block_numbers, plot_latency1, 'orange', alpha=0.7, label=f'Baseline ({args.baseline_csv})')
+            ax.plot(block_numbers, plot_latency2, comparison_color, alpha=0.7, label=f'Comparison ({args.comparison_csv})')
             ax.set_xlabel('Block Number')
             ax.set_ylabel('Total Latency (ms)')
-            ax.set_title('Total Latency vs Block Number')
+            title = 'Total Latency vs Block Number'
+            if args.average:
+                title += f' ({args.average}-block moving average)'
+            ax.set_title(title)
             ax.grid(True, alpha=0.3)
             ax.legend()
         else:
             # If no block_number column, use index
             indices = np.arange(len(percent_diff))
-            ax.plot(indices, latency1[:len(percent_diff)], 'orange', alpha=0.7, label=f'Baseline ({args.baseline_csv})')
-            ax.plot(indices, latency2[:len(percent_diff)], comparison_color, alpha=0.7, label=f'Comparison ({args.comparison_csv})')
+            ax.plot(indices, plot_latency1, 'orange', alpha=0.7, label=f'Baseline ({args.baseline_csv})')
+            ax.plot(indices, plot_latency2, comparison_color, alpha=0.7, label=f'Comparison ({args.comparison_csv})')
             ax.set_xlabel('Block Index')
             ax.set_ylabel('Total Latency (ms)')
-            ax.set_title('Total Latency vs Block Index')
+            title = 'Total Latency vs Block Index'
+            if args.average:
+                title += f' ({args.average}-block moving average)'
+            ax.set_title(title)
             ax.grid(True, alpha=0.3)
             ax.legend()
-        plot_idx += 1
+        
+        if args.separate:
+            plt.tight_layout()
+            output_file = get_output_filename(args.output, 'line')
+            plt.savefig(output_file, dpi=300, bbox_inches='tight')
+            output_files.append(output_file)
+            plt.close(fig)
+        else:
+            plot_idx += 1
 
-    plt.tight_layout()
-    plt.savefig(args.output, dpi=300, bbox_inches='tight')
+    # Plot gas/s graph if selected
+    if 'gas' in selected_graphs:
+        if args.separate:
+            fig, ax = plt.subplots(1, 1, figsize=(12, 6))
+        else:
+            ax = axes[plot_idx]
+            
+        # Calculate gas per second (gas/s)
+        # latency is in microseconds, so convert to seconds for gas/s calculation
+        gas1 = df1['gas_used'].values[:len(percent_diff)]
+        gas2 = df2['gas_used'].values[:len(percent_diff)]
+        
+        # Convert latency from microseconds to seconds
+        latency1_sec = df1['total_latency'].values[:len(percent_diff)] / 1_000_000.0
+        latency2_sec = df2['total_latency'].values[:len(percent_diff)] / 1_000_000.0
+        
+        # Calculate gas per second
+        gas_per_sec1 = gas1 / latency1_sec
+        gas_per_sec2 = gas2 / latency2_sec
+        
+        # Store original values for statistics before averaging
+        original_gas_per_sec1 = gas_per_sec1.copy()
+        original_gas_per_sec2 = gas_per_sec2.copy()
+        
+        # Apply moving average if requested
+        if args.average:
+            gas_per_sec1 = moving_average(gas_per_sec1, args.average)
+            gas_per_sec2 = moving_average(gas_per_sec2, args.average)
+        
+        # Calculate median gas/s for color determination (use original values)
+        median_gas_per_sec1 = np.median(original_gas_per_sec1)
+        median_gas_per_sec2 = np.median(original_gas_per_sec2)
+        comparison_color = 'green' if median_gas_per_sec2 > median_gas_per_sec1 else 'red'
+        
+        if 'block_number' in df1.columns and 'block_number' in df2.columns:
+            block_numbers = df1['block_number'].values[:len(percent_diff)]
+            ax.plot(block_numbers, gas_per_sec1, 'orange', alpha=0.7, label=f'Baseline ({args.baseline_csv})')
+            ax.plot(block_numbers, gas_per_sec2, comparison_color, alpha=0.7, label=f'Comparison ({args.comparison_csv})')
+            ax.set_xlabel('Block Number')
+            ax.set_ylabel('Gas Throughput')
+            title = 'Gas Throughput vs Block Number'
+            if args.average:
+                title += f' ({args.average}-block moving average)'
+            ax.set_title(title)
+            ax.grid(True, alpha=0.3)
+            ax.legend()
+            
+            # Format Y-axis with gas units
+            formatter = FuncFormatter(format_gas_units)
+            ax.yaxis.set_major_formatter(formatter)
+        else:
+            # If no block_number column, use index
+            indices = np.arange(len(percent_diff))
+            ax.plot(indices, gas_per_sec1, 'orange', alpha=0.7, label=f'Baseline ({args.baseline_csv})')
+            ax.plot(indices, gas_per_sec2, comparison_color, alpha=0.7, label=f'Comparison ({args.comparison_csv})')
+            ax.set_xlabel('Block Index')
+            ax.set_ylabel('Gas Throughput')
+            title = 'Gas Throughput vs Block Index'
+            if args.average:
+                title += f' ({args.average}-block moving average)'
+            ax.set_title(title)
+            ax.grid(True, alpha=0.3)
+            ax.legend()
+            
+            # Format Y-axis with gas units
+            formatter = FuncFormatter(format_gas_units)
+            ax.yaxis.set_major_formatter(formatter)
+        
+        if args.separate:
+            plt.tight_layout()
+            output_file = get_output_filename(args.output, 'gas')
+            plt.savefig(output_file, dpi=300, bbox_inches='tight')
+            output_files.append(output_file)
+            plt.close(fig)
+        else:
+            plot_idx += 1
+
+    # Save combined figure if not using separate files
+    if not args.separate:
+        plt.tight_layout()
+        plt.savefig(args.output, dpi=300, bbox_inches='tight')
+        output_files.append(args.output)
 
     # Create graph type description for output message
     graph_types = []
     if 'histogram' in selected_graphs:
         graph_types.append('histogram')
     if 'line' in selected_graphs:
         graph_types.append('latency graph')
+    if 'gas' in selected_graphs:
+        graph_types.append('gas/s graph')
     graph_desc = ' and '.join(graph_types)
-    print(f"{graph_desc.capitalize()} saved to {args.output}")
+    
+    # Print output file(s) information
+    if args.separate:
+        print(f"Saved {len(output_files)} separate files:")
+        for output_file in output_files:
+            print(f"  - {output_file}")
+    else:
+        print(f"{graph_desc.capitalize()} saved to {args.output}")
 
     # Always print statistics
     print(f"\nStatistics:")
@@ -170,6 +370,15 @@ def main():
     print(f"Min: {percent_diff.min():.2f}%")
     print(f"Max: {percent_diff.max():.2f}%")
     print(f"Total blocks analyzed: {len(percent_diff)}")
+    
+    # Print gas/s statistics if gas data is available
+    if 'gas' in selected_graphs:
+        # Use original values for statistics (not averaged)
+        print(f"\nGas/s Statistics:")
+        print(f"Baseline median gas/s: {median_gas_per_sec1:,.0f}")
+        print(f"Comparison median gas/s: {median_gas_per_sec2:,.0f}")
+        gas_diff_percent = ((median_gas_per_sec2 - median_gas_per_sec1) / median_gas_per_sec1) * 100
+        print(f"Gas/s percent change: {gas_diff_percent:+.2f}%")
 
 if __name__ == '__main__':
     main()

Original file line number	Diff line number	Diff line change
`@@ -63,3 +63,8 @@ recipe.json`
`63`	`63`	`_`
`64`	`64`	`# broken links report`
`65`	`65`	`links-report.json`
	`66`	`+`
	`67`	`+# Python cache`
	`68`	`+__pycache__/`
	`69`	`+*.py[cod]`
	`70`	`+*$py.class`