 import json
 import logging
 import os
+import resource
 import time
 import timeit
 from dataclasses import dataclass, fields, is_dataclass, MISSING
@@ -108,14 +109,14 @@ class CompileMode(Enum):
 
 
 @dataclass
-class MemoryStats:
+class GPUMemoryStats:
     rank: int
     malloc_retries: int
     max_mem_allocated_mbs: int
     max_mem_reserved_mbs: int
 
     @classmethod
-    def for_device(cls, rank: int) -> "MemoryStats":
+    def for_device(cls, rank: int) -> "GPUMemoryStats":
         stats = torch.cuda.memory_stats(rank)
         alloc_retries = stats.get("num_alloc_retries", 0)
         max_allocated = stats.get("allocated_bytes.all.peak", 0)
@@ -131,13 +132,31 @@ def __str__(self) -> str:
         return f"Rank {self.rank}: retries={self.malloc_retries}, allocated={self.max_mem_allocated_mbs:7}mb, reserved={self.max_mem_reserved_mbs:7}mb"
 
 
+@dataclass
+class CPUMemoryStats:
+    rank: int
+    peak_rss_mbs: int
+
+    @classmethod
+    def for_process(cls, rank: int) -> "CPUMemoryStats":
+        # Peak RSS from resource.getrusage (in KB on CentOS/Linux)
+        peak_rss_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
+        peak_rss_mb = peak_rss_kb // 1024
+
+        return cls(rank, peak_rss_mb)
+
+    def __str__(self) -> str:
+        return f"Rank {self.rank}: CPU Memory Peak RSS: {self.peak_rss_mbs / 1000:.2f} GB"
+
+
 @dataclass
 class BenchmarkResult:
     "Class for holding results of benchmark runs"
     short_name: str
     gpu_elapsed_time: torch.Tensor  # milliseconds
     cpu_elapsed_time: torch.Tensor  # milliseconds
-    mem_stats: List[MemoryStats]  # memory stats per rank
+    gpu_mem_stats: List[GPUMemoryStats]  # GPU memory stats per rank
+    cpu_mem_stats: List[CPUMemoryStats]  # CPU memory stats per rank
     rank: int = -1
 
     def __str__(self) -> str:
@@ -147,14 +166,16 @@ def __str__(self) -> str:
         cpu_runtime = (
             f"CPU Runtime (P90): {self.runtime_percentile(90, device='cpu'):.2f} ms"
         )
-        if len(self.mem_stats) == 0:
-            return f"{self.short_name: <{35}} | {gpu_runtime} | {cpu_runtime}"
-        mem_alloc = (
-            f"Peak Memory alloc (P90): {self.max_mem_alloc_percentile(90)/1000:.2f} GB"
-        )
-        mem_reserved = f"Peak Memory reserved (P90): {self.max_mem_reserved_percentile(90)/1000:.2f} GB"
+        cpu_mem = f"CPU Peak RSS (P90): {self.cpu_mem_percentile(90)/1000:.2f} GB"
+
+        if len(self.gpu_mem_stats) == 0:
+            return (
+                f"{self.short_name: <{35}} | {gpu_runtime} | {cpu_runtime} | {cpu_mem}"
+            )
+        mem_alloc = f"GPU Peak Memory alloc (P90): {self.max_mem_alloc_percentile(90)/1000:.2f} GB"
+        mem_reserved = f"GPU Peak Memory reserved (P90): {self.max_mem_reserved_percentile(90)/1000:.2f} GB"
         malloc_retries = f"Malloc retries (P50/P90/P100): {self.mem_retries(50)} / {self.mem_retries(90)} / {self.mem_retries(100)}"
-        return f"{self.short_name: <{35}} | {malloc_retries} | {gpu_runtime} | {cpu_runtime} | {mem_alloc} | {mem_reserved}"
+        return f"{self.short_name: <{35}} | {malloc_retries} | {gpu_runtime} | {cpu_runtime} | {mem_alloc} | {mem_reserved} | {cpu_mem}"
 
     def runtime_percentile(
         self,
@@ -199,15 +220,28 @@ def mem_retries(
 
     def _mem_percentile(
         self,
-        mem_selector: Callable[[MemoryStats], int],
+        mem_selector: Callable[[GPUMemoryStats], int],
         percentile: int = 50,
         interpolation: str = "nearest",
     ) -> torch.Tensor:
         mem_data = torch.tensor(
-            [mem_selector(mem_stat) for mem_stat in self.mem_stats], dtype=torch.float
+            [mem_selector(mem_stat) for mem_stat in self.gpu_mem_stats],
+            dtype=torch.float,
         )
         return torch.quantile(mem_data, percentile / 100.0, interpolation=interpolation)
 
+    def cpu_mem_percentile(
+        self, percentile: int = 50, interpolation: str = "nearest"
+    ) -> torch.Tensor:
+        """Return the CPU memory percentile for peak RSS."""
+        cpu_mem_data = torch.tensor(
+            [cpu_stat.peak_rss_mbs for cpu_stat in self.cpu_mem_stats],
+            dtype=torch.float,
+        )
+        return torch.quantile(
+            cpu_mem_data, percentile / 100.0, interpolation=interpolation
+        )
+
 
 class ECWrapper(torch.nn.Module):
     """
@@ -437,8 +471,11 @@ def write_report(
         qps_gpu = int(num_requests / avg_dur_s_gpu)
 
         mem_str = ""
-        for memory_stats in benchmark_res.mem_stats:
-            mem_str += f"{memory_stats}\n"
+        for gpu_memory_stats in benchmark_res.gpu_mem_stats:
+            mem_str += f"{gpu_memory_stats}\n"
+
+        for cpu_memory_stats in benchmark_res.cpu_mem_stats:
+            mem_str += f"{cpu_memory_stats}\n"
 
         report_str += (
             f"{benchmark_res.short_name:40} "
@@ -816,13 +853,16 @@ def _run_benchmark_core(
         gpu_elapsed_time = cpu_elapsed_time.clone()
 
     # Memory statistics collection
-    mem_stats: List[MemoryStats] = []
+    gpu_mem_stats: List[GPUMemoryStats] = []
+    cpu_mem_stats = [CPUMemoryStats.for_process(rank)]
+
     if device_type == "cuda":
         if rank == -1:
             for di in range(world_size):
-                mem_stats.append(MemoryStats.for_device(di))
+                gpu_mem_stats.append(GPUMemoryStats.for_device(di))
         else:
-            mem_stats.append(MemoryStats.for_device(rank))
+            gpu_mem_stats.append(GPUMemoryStats.for_device(rank))
+    # CPU memory stats are collected for both GPU and CPU-only runs
 
     # Optional detailed profiling
     if output_dir and profile_iter_fn and device_type == "cuda":
@@ -868,7 +908,8 @@ def _trace_handler(prof: torch.profiler.profile) -> None:
         short_name=name,
         gpu_elapsed_time=gpu_elapsed_time,
         cpu_elapsed_time=cpu_elapsed_time,
-        mem_stats=mem_stats,
+        gpu_mem_stats=gpu_mem_stats,
+        cpu_mem_stats=cpu_mem_stats,
         rank=rank,
     )
 
@@ -1139,7 +1180,8 @@ def setUp() -> None:
         res = qq.get()
 
         benchmark_res_per_rank.append(res)
-        assert len(res.mem_stats) == 1
+        assert len(res.gpu_mem_stats) == 1
+        assert len(res.cpu_mem_stats) == 1
 
     for p in processes:
         p.join()
@@ -1149,13 +1191,15 @@ def setUp() -> None:
         short_name=benchmark_res_per_rank[0].short_name,
         gpu_elapsed_time=benchmark_res_per_rank[0].gpu_elapsed_time,
         cpu_elapsed_time=benchmark_res_per_rank[0].cpu_elapsed_time,
-        mem_stats=[MemoryStats(rank, 0, 0, 0) for rank in range(world_size)],
+        gpu_mem_stats=[GPUMemoryStats(rank, 0, 0, 0) for rank in range(world_size)],
+        cpu_mem_stats=[CPUMemoryStats(rank, 0) for rank in range(world_size)],
         rank=0,
     )
 
     for res in benchmark_res_per_rank:
-        # Each rank's BenchmarkResult contains 1 memory measurement
-        total_benchmark_res.mem_stats[res.rank] = res.mem_stats[0]
+        # Each rank's BenchmarkResult contains 1 GPU and 1 CPU memory measurement
+        total_benchmark_res.gpu_mem_stats[res.rank] = res.gpu_mem_stats[0]
+        total_benchmark_res.cpu_mem_stats[res.rank] = res.cpu_mem_stats[0]
 
     return total_benchmark_res
 
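Below the patch, a small illustrative sketch (not part of the diff) of the CPU peak-RSS path it adds: resource.getrusage reports ru_maxrss in kilobytes on Linux (hence the // 1024 conversion to megabytes; on macOS the same field is in bytes), one CPUMemoryStats-style measurement is collected per rank, and torch.quantile produces the P90 figure printed by BenchmarkResult.__str__. The per-rank values below are invented for illustration only.

import resource

import torch


def peak_rss_mb() -> int:
    # ru_maxrss is kilobytes on Linux (bytes on macOS); the patch assumes Linux.
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss // 1024


print(f"This process peak RSS: {peak_rss_mb()} MB")

# One peak-RSS measurement per rank, in MB (illustrative values only).
per_rank_peak_rss_mbs = [10_240, 11_264, 12_288, 15_360]

cpu_mem_data = torch.tensor(per_rank_peak_rss_mbs, dtype=torch.float)
p90_mb = torch.quantile(cpu_mem_data, 0.90, interpolation="nearest")
print(f"CPU Peak RSS (P90): {p90_mb / 1000:.2f} GB")  # mirrors the new report line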