@@ -49,12 +49,12 @@ def __init__(self, labels: List[str], max_model_len: int):
4949 description = "Number of generation tokens processed." ,
5050 kind = pb_utils .MetricFamily .COUNTER ,
5151 )
52- self .counter_preemption_tokens_family = pb_utils .MetricFamily (
52+ self .counter_num_preemption_family = pb_utils .MetricFamily (
5353 name = "vllm:num_preemptions_total" ,
5454 description = "Number of preemption tokens processed." ,
5555 kind = pb_utils .MetricFamily .COUNTER ,
5656 )
57- self .histogram_iteration_tokens_total_family = pb_utils .MetricFamily (
57+ self .histogram_iteration_tokens_family = pb_utils .MetricFamily (
5858 name = "vllm:iteration_tokens_total" ,
5959 description = "Histogram of number of tokens per engine_step." ,
6060 kind = pb_utils .MetricFamily .HISTOGRAM ,
@@ -124,33 +124,12 @@ def __init__(self, labels: List[str], max_model_len: int):
124124 description = "Number of requests waiting to be processed." ,
125125 kind = pb_utils .MetricFamily .GAUGE ,
126126 )
127- self .gauge_scheduler_swapped_family = pb_utils .MetricFamily (
128- name = "vllm:num_requests_swapped" ,
129- description = "Number of requests swapped to CPU." ,
130- kind = pb_utils .MetricFamily .GAUGE ,
131- )
132127 # KV Cache Usage in %
133128 self .gauge_gpu_cache_usage_family = pb_utils .MetricFamily (
134129 name = "vllm:gpu_cache_usage_perc" ,
135130 description = "GPU KV-cache usage. 1 means 100 percent usage." ,
136131 kind = pb_utils .MetricFamily .GAUGE ,
137132 )
138- self .gauge_cpu_cache_usage_family = pb_utils .MetricFamily (
139- name = "vllm:cpu_cache_usage_perc" ,
140- description = "CPU KV-cache usage. 1 means 100 percent usage." ,
141- kind = pb_utils .MetricFamily .GAUGE ,
142- )
143- # Prefix caching block hit rate
144- self .gauge_cpu_prefix_cache_hit_rate_family = pb_utils .MetricFamily (
145- name = "vllm:cpu_prefix_cache_hit_rate" ,
146- description = "CPU prefix cache block hit rate." ,
147- kind = pb_utils .MetricFamily .GAUGE ,
148- )
149- self .gauge_gpu_prefix_cache_hit_rate_family = pb_utils .MetricFamily (
150- name = "vllm:gpu_prefix_cache_hit_rate" ,
151- description = "GPU prefix cache block hit rate." ,
152- kind = pb_utils .MetricFamily .GAUGE ,
153- )
154133
155134 # Initialize metrics
156135 # Iteration stats
@@ -160,14 +139,14 @@ def __init__(self, labels: List[str], max_model_len: int):
160139 self .counter_generation_tokens = self .counter_generation_tokens_family .Metric (
161140 labels = labels
162141 )
163- self .counter_preemption_tokens = self .counter_preemption_tokens_family .Metric (
142+ self .counter_num_preemption = self .counter_num_preemption_family .Metric (
164143 labels = labels
165144 )
166145
167146 # Use the same bucket boundaries from vLLM sample metrics as an example.
168147 # https://github.com/vllm-project/vllm/blob/21313e09e3f9448817016290da20d0db1adf3664/vllm/engine/metrics.py#L81-L96
169- self .histogram_iteration_tokens_total = (
170- self .histogram_iteration_tokens_total_family .Metric (
148+ self .histogram_iteration_tokens = (
149+ self .histogram_iteration_tokens_family .Metric (
171150 labels = labels ,
172151 buckets = [1 , 8 , 16 , 32 , 64 , 128 , 256 , 512 , 1024 , 2048 , 4096 , 8192 , 16384 ],
173152 )
@@ -218,32 +197,36 @@ def __init__(self, labels: List[str], max_model_len: int):
218197 )
219198 # Request stats
220199 # Latency
200+ request_latency_buckets = [
201+ 0.3 , 0.5 , 0.8 , 1.0 , 1.5 , 2.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 ,
202+ 40.0 , 50.0 , 60.0 , 120.0 , 240.0 , 480.0 , 960.0 , 1920.0 , 7680.0
203+ ]
221204 self .histogram_e2e_time_request = self .histogram_e2e_time_request_family .Metric (
222205 labels = labels ,
223- buckets = [ 1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ] ,
206+ buckets = request_latency_buckets ,
224207 )
225208 self .histogram_prefill_time_request = (
226209 self .histogram_prefill_time_request_family .Metric (
227210 labels = labels ,
228- buckets = [ 1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ] ,
211+ buckets = request_latency_buckets ,
229212 )
230213 )
231214 self .histogram_decode_time_request = (
232215 self .histogram_decode_time_request_family .Metric (
233216 labels = labels ,
234- buckets = [ 1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ] ,
217+ buckets = request_latency_buckets ,
235218 )
236219 )
237220 self .histogram_inference_time_request = (
238221 self .histogram_inference_time_request_family .Metric (
239222 labels = labels ,
240- buckets = [ 1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ] ,
223+ buckets = request_latency_buckets ,
241224 )
242225 )
243226 self .histogram_queue_time_request = (
244227 self .histogram_queue_time_request_family .Metric (
245228 labels = labels ,
246- buckets = [ 1.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 , 40.0 , 50.0 , 60.0 ] ,
229+ buckets = request_latency_buckets ,
247230 )
248231 )
249232 # Metadata
@@ -265,29 +248,16 @@ def __init__(self, labels: List[str], max_model_len: int):
265248 )
266249 # System stats
267250 # Scheduler State
268- self .gauge_num_requests_running = self .gauge_scheduler_running_family .Metric (
251+ self .gauge_scheduler_running = self .gauge_scheduler_running_family .Metric (
269252 labels = labels
270253 )
271- self .gauge_num_requests_waiting = self .gauge_scheduler_waiting_family .Metric (
272- labels = labels
273- )
274- self .gauge_num_requests_swapped = self .gauge_scheduler_swapped_family .Metric (
254+ self .gauge_scheduler_waiting = self .gauge_scheduler_waiting_family .Metric (
275255 labels = labels
276256 )
277257 # KV Cache Usage in %
278258 self .gauge_gpu_cache_usage = self .gauge_gpu_cache_usage_family .Metric (
279259 labels = labels
280260 )
281- self .gauge_cpu_cache_usage = self .gauge_cpu_cache_usage_family .Metric (
282- labels = labels
283- )
284- # Prefix caching block hit rate
285- self .gauge_cpu_prefix_cache_hit_rate = (
286- self .gauge_cpu_prefix_cache_hit_rate_family .Metric (labels = labels )
287- )
288- self .gauge_gpu_prefix_cache_hit_rate = (
289- self .gauge_gpu_prefix_cache_hit_rate_family .Metric (labels = labels )
290- )
291261
292262
293263class VllmStatLogger (VllmStatLoggerBase ):
@@ -394,19 +364,9 @@ def log(self, stats: VllmStats) -> None:
394364 (self .metrics .histogram_n_request , stats .n_requests ),
395365 ]
396366 gauge_metrics = [
397- (self .metrics .gauge_num_requests_running , stats .num_running_sys ),
398- (self .metrics .gauge_num_requests_waiting , stats .num_waiting_sys ),
399- (self .metrics .gauge_num_requests_swapped , stats .num_swapped_sys ),
367+ (self .metrics .gauge_scheduler_running , stats .num_running_sys ),
368+ (self .metrics .gauge_scheduler_waiting , stats .num_waiting_sys ),
400369 (self .metrics .gauge_gpu_cache_usage , stats .gpu_cache_usage_sys ),
401- (self .metrics .gauge_cpu_cache_usage , stats .cpu_cache_usage_sys ),
402- (
403- self .metrics .gauge_cpu_prefix_cache_hit_rate ,
404- stats .cpu_prefix_cache_hit_rate ,
405- ),
406- (
407- self .metrics .gauge_gpu_prefix_cache_hit_rate ,
408- stats .gpu_prefix_cache_hit_rate ,
409- ),
410370 ]
411371 for metric , data in counter_metrics :
412372 self ._log_counter (metric , data )
0 commit comments