Skip to content

Commit 175acc0

Browse files
committed
vllm_queue_status
Signed-off-by: Huamin Li <[email protected]>
1 parent a287d59 commit 175acc0

File tree

5 files changed

+782
-0
lines changed

5 files changed

+782
-0
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"params": {
3+
"startTime": "DateTime64(3)",
4+
"stopTime": "DateTime64(3)"
5+
},
6+
"tests": [
7+
{
8+
"startTime": "2025-10-17T00:00:00.000",
9+
"stopTime": "2025-10-18T00:00:00.000"
10+
}
11+
]
12+
}
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
/* Windowed per-build table (UTC), incl. PR & main builds, with queue totals, cost, and is_main_branch.
2+
WAIT: only attempts with started_at IS NOT NULL contribute wait (runnable → started).
3+
RUN: clip to [w_start, w_end]; 1-day zombie guard for open 'running' attempts.
4+
COST: 1.3232 * gpu_1_queue_run_hours + 4.602 * gpu_4_queue_run_hours (fixed).
5+
*/
6+
7+
WITH
  /* Query-wide scalar aliases.
     The window bounds arrive as strings. They are parsed with an explicit 'UTC'
     zone so that inputs without an offset are not interpreted in the
     server/session time zone (the rest of the query treats them as UTC). */
  parseDateTime64BestEffort({startTime:String}, 3, 'UTC') AS w_start,        -- inclusive (UTC)
  parseDateTime64BestEffort({stopTime:String}, 3, 'UTC')  AS w_end,          -- exclusive (UTC)
  toDateTime64(now(), 3)                                  AS now64,          -- query-time clock, ms precision
  (w_end - INTERVAL 1 DAY)                                AS zombie_cutoff,  -- one day before the window end
  toDateTime64('2100-01-01 00:00:00', 3)                  AS FAR_FUTURE,     -- stand-in for a missing finish time
  ['gpu_1_queue','gpu_4_queue']                           AS QUEUES          -- queues that are scored
14+
15+
/* 1) One row per build whose build.created_at lies in [w_start, w_end).
      Build-level attributes are taken from the row with the greatest
      job.created_at (argMax); branch / PR context is derived alongside. */
, builds_window AS (
  SELECT
    tupleElement(build,'id') AS build_id,

    -- identity / links
    argMax(tupleElement(build,'number'),  tupleElement(job,'created_at')) AS build_number,
    argMax(tupleElement(build,'web_url'), tupleElement(job,'created_at')) AS build_url,
    concat(
      argMax(tupleElement(build,'web_url'), tupleElement(job,'created_at')),
      '/steps/table'
    ) AS steps_table_url,
    argMax(tupleElement(build,'commit'),  tupleElement(job,'created_at')) AS commit_sha,

    -- start / finish: build-level value first, else earliest job start / latest job finish
    ifNull(
      argMax(tupleElement(build,'started_at'), tupleElement(job,'created_at')),
      min(tupleElement(job,'started_at'))
    ) AS robust_start,
    ifNull(
      argMax(tupleElement(build,'finished_at'), tupleElement(job,'created_at')),
      max(tupleElement(job,'finished_at'))
    ) AS robust_finish,

    countDistinct(tupleElement(job,'id')) AS steps_count,
    argMax(tupleElement(build,'state'), tupleElement(job,'created_at')) AS latest_build_state,

    -- "owner/repo": pipeline repository URL, then PR repository URL, then a bare "owner/repo" in the PR repository
    coalesce(
      nullIf(
        extract(
          argMax(tupleElement(pipeline,'repository'), tupleElement(job,'created_at')),
          'github\\.com[:/]+([^/]+/[^/.]+)'
        ),
        ''
      ),
      nullIf(
        extract(
          argMax(tupleElement(build,'pull_request').repository, tupleElement(job,'created_at')),
          'github\\.com[:/]+([^/]+/[^/.]+)'
        ),
        ''
      ),
      nullIf(
        extract(
          argMax(tupleElement(build,'pull_request').repository, tupleElement(job,'created_at')),
          '([^/]+/[^/.]+)'
        ),
        ''
      )
    ) AS repo_slug,

    -- PR number: explicit pull_request.id, else parsed out of a 'pull/<n>' branch name
    ifNull(
      toInt64OrNull(
        argMax(tupleElement(build,'pull_request').id, tupleElement(job,'created_at'))
      ),
      toInt64OrNull(
        extract(
          argMax(tupleElement(build,'branch'), tupleElement(job,'created_at')),
          'pull/([0-9]+)'
        )
      )
    ) AS pr_number,

    argMax(tupleElement(build,'created_at'), tupleElement(job,'created_at')) AS build_created_at_utc,
    argMax(tupleElement(build,'branch'),     tupleElement(job,'created_at')) AS branch_name
  FROM vllm.vllm_buildkite_jobs
  GROUP BY tupleElement(build,'id')
  -- keep only builds created inside the half-open window
  HAVING build_created_at_utc >= w_start
     AND build_created_at_utc <  w_end
)
51+
52+
/* 2) Script/command job rows belonging to the builds selected in builds_window,
      with the queue name pulled out of the job's agent query rules. */
, base_agent AS (
  SELECT
    tupleElement(build,'id')        AS build_id,
    tupleElement(job,'id')          AS job_id,
    tupleElement(job,'created_at')  AS created_at,
    tupleElement(job,'state')       AS state,
    tupleElement(job,'runnable_at') AS runnable_at,
    tupleElement(job,'started_at')  AS started_at,
    tupleElement(job,'finished_at') AS finished_at,
    -- first 'queue=<name>' rule with the prefix removed ('' when no such rule exists)
    replaceOne(
      arrayFirst(x -> startsWith(x,'queue='), tupleElement(job,'agent_query_rules')),
      'queue=',
      ''
    ) AS queue_key
  FROM vllm.vllm_buildkite_jobs
  -- builds_window yields one row per build_id, so this semi-join keeps the same rows as an inner join
  WHERE tupleElement(build,'id') IN (SELECT build_id FROM builds_window)
    AND tupleElement(job,'type') IN ('script','command')
    AND (
         tupleElement(job,'runnable_at') < w_end
      OR tupleElement(job,'started_at')  < w_end
      OR ifNull(tupleElement(job,'finished_at'), FAR_FUTURE) >= w_start
    )
)
73+
74+
/* 3) Collapse to (build_id, job_id) and collect attempts keyed by queue.

      An attempt is identified by (queue_key, started_at). When the same attempt
      shows up in more than one input row -- once still open (finished_at NULL)
      and once finished -- keeping both triplets would count its run time twice,
      because an open triplet is later extended to the end of the window.
      The inner query therefore resolves finished_at per attempt first
      (max() skips NULLs), so an attempt stays open only if no row carries a
      finish time for it. */
, jobs_by_build AS (
  SELECT
    build_id,
    job_id,
    argMax(state, created_at) AS latest_state,
    max(created_at)           AS last_event_at,

    /* RUN attempts: (queue, start, finish); rows without a start are dropped */
    arrayDistinct(arrayFilter(t -> t.2 IS NOT NULL,
      groupArray((queue_key, started_at, resolved_finished_at))
    )) AS run_triplets,

    /* WAIT attempts: (queue, runnable, start) -- ONLY attempts that actually started */
    arrayDistinct(arrayFilter(t -> t.2 IS NOT NULL AND t.3 IS NOT NULL,
      groupArray((queue_key, runnable_at, started_at))
    )) AS wait_triplets
  FROM (
    SELECT
      build_id,
      job_id,
      state,
      created_at,
      queue_key,
      runnable_at,
      started_at,
      max(finished_at) OVER (
        PARTITION BY build_id, job_id, queue_key, started_at
      ) AS resolved_finished_at
    FROM base_agent
  )
  GROUP BY build_id, job_id
)
94+
95+
/* 4) RUN attempts -> one row per attempt, clipped to the window.
      s_clip: start, not earlier than w_start.
      e_clip: finish, not later than w_end. For an attempt without a finish time:
        - zombie guard: a job whose latest state is 'running' but whose
          last_event_at is older than zombie_cutoff ends one minute after
          last_event_at;
        - otherwise it is treated as still running and ends at the query time
          (now64), so a window that reaches into the future does not credit
          run time that has not happened yet. For windows entirely in the past
          this equals w_end. */
, runs_scored AS (
  SELECT
    build_id,
    tupleElement(rt, 1) AS queue_key,
    greatest(tupleElement(rt, 2), w_start) AS s_clip,
    least(
      ifNull(
        tupleElement(rt, 3),
        if(latest_state = 'running' AND last_event_at < zombie_cutoff,
           least(last_event_at + INTERVAL 1 MINUTE, w_end),
           least(now64, w_end))
      ),
      w_end
    ) AS e_clip
  FROM jobs_by_build
  ARRAY JOIN run_triplets AS rt
  WHERE tupleElement(rt, 1) IN QUEUES
)
114+
-- Total clipped run seconds per (build, queue); intervals that are empty after clipping add nothing.
, run_by_build AS (
  SELECT
    build_id,
    queue_key,
    sum(
      if(e_clip > s_clip, dateDiff('second', s_clip, e_clip), 0)
    ) AS total_run_s
  FROM runs_scored
  GROUP BY
    build_id,
    queue_key
)
121+
122+
/* 5) WAIT attempts (runnable -> started), one row per attempt, clipped to the window.
      wait_s is the clipped interval length in seconds, floored at 0. */
, waits_scored AS (
  SELECT
    build_id,
    tupleElement(wt, 1)                    AS queue_key,
    greatest(tupleElement(wt, 2), w_start) AS ra_clip,   -- runnable time, not before w_start
    least(tupleElement(wt, 3), w_end)      AS st_clip,   -- start time, not after w_end
    greatest(0, dateDiff('second', ra_clip, st_clip)) AS wait_s
  FROM jobs_by_build
  ARRAY JOIN wait_triplets AS wt
  WHERE tupleElement(wt, 1) IN QUEUES
)
134+
/* P90 of positive wait seconds per build: one column per queue plus a combined one.
   quantileIf over zero matching rows yields NaN, which ifNull() further down
   does not catch. A build that waited on only one queue would then report NaN
   for the other, so NaN is mapped to 0 here. */
, waits_p90_pivot AS (
  SELECT
    build_id,
    /* P90 per queue (approximate quantile; broadly supported) */
    if(isNaN(quantileIf(0.9)(toFloat64(wait_s), queue_key = 'gpu_1_queue')),
       0,
       quantileIf(0.9)(toFloat64(wait_s), queue_key = 'gpu_1_queue')) AS gpu1_p90_s,
    if(isNaN(quantileIf(0.9)(toFloat64(wait_s), queue_key = 'gpu_4_queue')),
       0,
       quantileIf(0.9)(toFloat64(wait_s), queue_key = 'gpu_4_queue')) AS gpu4_p90_s,
    /* Combined P90 across both queues (every group has at least one row, so no NaN guard) */
    quantile(0.9)(toFloat64(wait_s)) AS p90_combined_s
  FROM waits_scored
  WHERE wait_s > 0
  GROUP BY build_id
)
146+
147+
/* 6) Per build: run seconds of each scored queue converted to hours (2 decimals),
      one column per queue. */
, run_totals_by_build AS (
  SELECT
    build_id,
    round(
      sumIf(total_run_s, queue_key = 'gpu_1_queue') / 3600.0,
      2
    ) AS gpu_1_queue_run_hours,
    round(
      sumIf(total_run_s, queue_key = 'gpu_4_queue') / 3600.0,
      2
    ) AS gpu_4_queue_run_hours
  FROM run_by_build
  GROUP BY build_id
)
156+
157+
/* 7) Result: one row per build in the window (PR and non-PR alike),
      ordered by build creation time. */
SELECT
  -- PR link; NULL unless both the PR number and the repo slug are known
  if(
    (b.pr_number IS NOT NULL) AND (b.repo_slug IS NOT NULL),
    concat('https://github.com/', b.repo_slug, '/pull/', toString(b.pr_number)),
    NULL
  ) AS pr_url,

  b.build_number     AS build_number,
  b.build_id         AS build_id,
  b.build_url        AS build_url,
  b.steps_table_url  AS steps_table_url,
  b.commit_sha       AS commit_sha,

  b.robust_start     AS build_started_at,
  b.robust_finish    AS build_finished_at,

  -- wall-clock duration in hours (finish - start); NULL when either end is missing
  if(
    b.robust_start IS NULL OR b.robust_finish IS NULL,
    NULL,
    round(dateDiff('second', b.robust_start, b.robust_finish) / 3600.0, 2)
  ) AS duration_hours,

  b.steps_count        AS steps_count,
  b.latest_build_state AS latest_build_state,

  -- run hours per queue (inputs to cost)
  ifNull(rt.gpu_1_queue_run_hours, 0) AS gpu_1_queue_run_hours,
  ifNull(rt.gpu_4_queue_run_hours, 0) AS gpu_4_queue_run_hours,

  -- P90 wait in hours: per queue, and across both queues
  round(ifNull(wp.gpu1_p90_s, 0)     / 3600.0, 2) AS gpu_1_queue_wait_p90_hours,
  round(ifNull(wp.gpu4_p90_s, 0)     / 3600.0, 2) AS gpu_4_queue_wait_p90_hours,
  round(ifNull(wp.p90_combined_s, 0) / 3600.0, 2) AS wait_p90_hours,

  -- fixed hourly rates applied to the (already rounded) run hours
  round(
      1.3232 * ifNull(rt.gpu_1_queue_run_hours, 0)
    + 4.602  * ifNull(rt.gpu_4_queue_run_hours, 0),
    2
  ) AS cost,

  -- 1 when the branch name is exactly 'main', else 0
  toUInt8(b.branch_name = 'main') AS is_main_branch

FROM builds_window AS b
LEFT JOIN run_totals_by_build AS rt
  ON rt.build_id = b.build_id
LEFT JOIN waits_p90_pivot AS wp
  ON wp.build_id = b.build_id
ORDER BY b.build_created_at_utc ASC;

0 commit comments

Comments
 (0)