Commit 31a047f
Profiling Infrastructure (#354)
* Simplify run_single_evaluation. This de-duplicates some code paths and makes it easier to patch profiling calls into the function later on.

* Add GPU runtime info to SystemInfo. This lets us tell later on whether we are running on CUDA or ROCm, and also fixes the ROCm fallback path.

* Add a 'link' report result type. This is used to communicate external download links, such as links to profiling results.

* Add profiling data infrastructure. A new ProfileResult type is added to run_eval and returned as part of EvalResult (a rough sketch of the type follows below). Among other fields, it contains `download_url`, which the user should use to download profiling data. Note that the actual public download link may not be known inside run_eval.py; in that case, the launcher is expected to fix up `download_url` before returning the results to libkernelbot.

* GitHub launcher: separate artifact downloading from indexing. The new function `GitHubRun.get_artifact_index` returns a dict of the artifacts available from the run. For each artifact, both the GitHub API URL and the public download URL are returned. The latter is not available directly from the GitHub API, but it can easily be constructed from data present in the workflow result. `download_artifacts` is replaced by a function that downloads a single, specified artifact rather than all of them. Additionally, the function no longer writes to a temp file when downloading; the response body can be piped directly into zipfile using BytesIO.

* GitHub runner: yield 'profile_data/*' from the job as profile data. The idea is that eval_run.py places profiling data in the profile_data/ directory, which is then automatically exported to the user. This is done by uploading that directory as the 'profile-data' artifact, fetching its public download link, and returning that link as ProfileResult.download_url.
1 parent 940b339 commit 31a047f
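For orientation, the sketch below shows roughly what the new ProfileResult could look like. This is an assumption: the real dataclass lives in libkernelbot/run_eval.py, which is not among the diffs shown on this page; the only fields this commit demonstrably uses are `profiler` (read in report.py) and `download_url` (patched by the GitHub launcher).

import dataclasses
from typing import Optional

@dataclasses.dataclass
class ProfileResult:
    # Sketch only: field set inferred from usage in this commit, not the real definition.
    profiler: str                       # name of the profiling tool, used in the report link title
    download_url: Optional[str] = None  # public link to the profiling artifact; the launcher
                                        # fixes this up before results reach libkernelbot

EvalResult then carries this as its new `profile` field, as the launcher diff below shows (`profile=profile_res`).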

File tree: 10 files changed, +214 -81 lines

.github/workflows/amd_workflow.yml
Lines changed: 12 additions & 5 deletions

@@ -35,13 +35,13 @@ jobs:
         run: |
           # Extract the payload content without printing it
           PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH)
-
+
           # Apply mask to the extracted content
           echo "::add-mask::$PAYLOAD"
-
+
           # Now write to file (won't be logged since it's masked)
           echo "$PAYLOAD" > payload.json
-
+
       - name: Set venv directory based on runner
         run: |
           if [[ "${{ github.event.inputs.runner }}" == "amdgpu-mi250-x86-64" ]]; then
@@ -77,5 +77,12 @@ jobs:
         if: always()
         with:
           name: run-result
-          path: |
-            result.json
+          path: result.json
+
+      - name: Upload profiling artifacts
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: profile-data
+          path: profile_data/*
+          retention-days: 1

.github/workflows/nvidia_workflow.yml
Lines changed: 10 additions & 5 deletions

@@ -42,10 +42,10 @@ jobs:
           # Extract the payload content without printing it
           apt-get update && apt-get install -y jq
           PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH)
-
+
           # Apply mask to the extracted content
           echo "::add-mask::$PAYLOAD"
-
+
           # Now write to file (won't be logged since it's masked)
           echo "$PAYLOAD" > payload.json

@@ -73,15 +73,20 @@ jobs:
         shell: bash
         run: |
           python src/runners/github-runner.py
-          cat result.json # Debug: show output

       - name: Upload training artifacts
         uses: actions/upload-artifact@v4
         if: always()
         with:
           name: run-result
-          path: |
-            result.json
+          path: result.json

+      - name: Upload profiling artifacts
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: profile-data
+          path: profile_data/*
+          retention-days: 1
     env:
       CUDA_VISIBLE_DEVICES: 0
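Both workflows simply upload whatever the evaluation step leaves under profile_data/. As a purely hypothetical illustration (the population logic lives in the runner and eval scripts, which are not part of the diffs on this page; the file name and profiler output are made up), the producing side only needs to stage files into that directory:

import pathlib
import shutil

# Hypothetical sketch: the only contract with the workflows above is the "profile_data" directory name.
out_dir = pathlib.Path("profile_data")
out_dir.mkdir(exist_ok=True)
shutil.copy("kernel_profile.ncu-rep", out_dir / "kernel_profile.ncu-rep")  # stage profiler output for upload

With retention-days set to 1, the uploaded profile-data artifact expires after a day, so the download link in the report is deliberately short-lived.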

scripts/ci_test_cuda.py
Lines changed: 3 additions & 1 deletion

@@ -4,7 +4,7 @@
 import pytest

 from libkernelbot.consts import ExitCode, SubmissionMode
-from libkernelbot.run_eval import compile_cuda_script, run_cuda_script
+from libkernelbot.run_eval import compile_cuda_script, make_system_info, run_cuda_script

 ref = Path("examples/identity_cuda/reference.cuh").read_text()
 task_h = Path("examples/identity_cuda/task.h").read_text()
@@ -19,6 +19,7 @@ def run_cuda_helper(sources: dict, headers: dict = None, arch=None, **kwargs):
         headers = header_files

     eval_result = run_cuda_script(
+        make_system_info(),
         sources,
         headers,
         arch=arch,
@@ -194,6 +195,7 @@ def test_include_dirs(tmp_path: Path):

     # can also use generic flags argument
     result = run_cuda_script(
+        make_system_info(),
         {"eval.cu": eval_cu, "submission.cu": sub},
         header_files,
         flags=["-I.", f"-I{tmp_path}"],

scripts/ci_test_python.py
Lines changed: 2 additions & 1 deletion

@@ -1,7 +1,7 @@
 from pathlib import Path

 from libkernelbot.consts import ExitCode, SubmissionMode
-from libkernelbot.run_eval import run_pytorch_script
+from libkernelbot.run_eval import make_system_info, run_pytorch_script

 ref = Path("examples/identity_py/reference.py").read_text()
 task = Path("examples/identity_py/task.py").read_text()
@@ -12,6 +12,7 @@

 def run_pytorch_helper(sources: dict, tests=None, **kwargs):
     result = run_pytorch_script(
+        make_system_info(),
         sources,
         "eval.py",
         mode=SubmissionMode.TEST.value,

src/kernelbot/discord_reporter.py
Lines changed: 6 additions & 0 deletions

@@ -2,6 +2,7 @@
 from discord_utils import _send_split_log

 from libkernelbot.report import (
+    Link,
     Log,
     MultiProgressReporter,
     RunProgressReporter,
@@ -69,6 +70,11 @@ async def display_report(self, title: str, report: RunResultReport):
                 message += part.text
             elif isinstance(part, Log):
                 message = await _send_split_log(thread, message, part.header, part.content)
+            elif isinstance(part, Link):
+                if len(message) > 0:
+                    await thread.send(message)
+                    message = ""
+                await thread.send(f"{part.title}: [{part.text}]({part.url})")

         if len(message) > 0:
             await thread.send(message)

src/libkernelbot/launchers/github.py
Lines changed: 71 additions & 38 deletions

@@ -1,10 +1,11 @@
 import asyncio
 import base64
+import dataclasses
 import datetime
+import io
 import json
 import math
 import pprint
-import tempfile
 import uuid
 import zipfile
 import zlib
@@ -23,7 +24,14 @@
     SubmissionMode,
 )
 from libkernelbot.report import RunProgressReporter
-from libkernelbot.run_eval import CompileResult, EvalResult, FullResult, RunResult, SystemInfo
+from libkernelbot.run_eval import (
+    CompileResult,
+    EvalResult,
+    FullResult,
+    ProfileResult,
+    RunResult,
+    SystemInfo,
+)
 from libkernelbot.utils import setup_logging

 from .launcher import Launcher
@@ -49,7 +57,7 @@ def __init__(self, repo: str, token: str, branch: str):
         self.token = token
         self.branch = branch

-    async def run_submission(
+    async def run_submission(  # noqa: C901
         self, config: dict, gpu_type: GPU, status: RunProgressReporter
     ) -> FullResult:
         gpu_vendor = None
@@ -106,15 +114,17 @@ async def run_submission(
         await status.push("Downloading artifacts...")
         logger.info("Downloading artifacts...")

-        artifacts = await run.download_artifacts()
-        if "run-result" not in artifacts:
-            logger.error("Could not find `run-result` among artifacts: %s", artifacts.keys())
+        index = run.get_artifact_index()
+
+        if "run-result" not in index:
+            logger.error("Could not find `run-result` among artifacts: %s", index.keys())
             await status.push("Downloading artifacts... failed")
             return FullResult(
                 success=False, error="Could not download artifacts", runs={}, system=SystemInfo()
             )

-        logs = artifacts["run-result"]["result.json"].decode("utf-8")
+        artifact = await run.download_artifact(index["run-result"])
+        logs = artifact["result.json"].decode("utf-8")

         await status.update("Downloading artifacts... done")
         logger.info("Downloading artifacts... done")
@@ -123,17 +133,24 @@
         runs = {}
         # convert json back to EvalResult structures, which requires
         # special handling for datetime and our dataclasses.
+
         for k, v in data["runs"].items():
-            if "compilation" in v and v["compilation"] is not None:
-                comp = CompileResult(**v["compilation"])
-            else:
-                comp = None
-            run = RunResult(**v["run"])
+            comp_res = None if v.get("compilation") is None else CompileResult(**v["compilation"])
+            run_res = None if v.get("run") is None else RunResult(**v["run"])
+            profile_res = None if v.get("profile") is None else ProfileResult(**v["profile"])
+
+            # Update profile artifact to the actual download URL.
+            # For the GitHub launcher the profile_artifact currently just contains
+            # the name of the artifact.
+            if profile_res is not None:
+                profile_res.download_url = index["profile-data"].public_download_url
+
             res = EvalResult(
                 start=datetime.datetime.fromisoformat(v["start"]),
                 end=datetime.datetime.fromisoformat(v["end"]),
-                compilation=comp,
-                run=run,
+                compilation=comp_res,
+                run=run_res,
+                profile=profile_res,
             )
             runs[k] = res

@@ -147,6 +164,13 @@ async def wait_callback(self, run: "GitHubRun", status: RunProgressReporter):
         )


+@dataclasses.dataclass
+class GitHubArtifact:
+    name: str
+    archive_download_url: str
+    public_download_url: str
+
+
 class GitHubRun:
     def __init__(self, repo: str, token: str, branch: str, workflow_file: str):
         gh = Github(token)
@@ -323,34 +347,43 @@ async def wait_for_completion(
             logger.error(f"Error waiting for GitHub run {self.run_id}: {e}", exc_info=e)
             raise  # Re-raise other exceptions

-    async def download_artifacts(self) -> dict:
-        logger.info("Attempting to download artifacts for run %s", self.run_id)
+
+    def get_artifact_index(self) -> dict[str, GitHubArtifact]:
+        logger.info("Creating artifact index for run %s", self.run_id)
         artifacts = self.run.get_artifacts()

         extracted = {}

         for artifact in artifacts:
-            url = artifact.archive_download_url
-            headers = {"Authorization": f"token {self.token}"}
-            response = requests.get(url, headers=headers)
-
-            if response.status_code == 200:
-                with tempfile.NamedTemporaryFile("w+b") as temp:
-                    temp.write(response.content)
-                    temp.flush()
-
-                    with zipfile.ZipFile(temp.name) as z:
-                        artifact_dict = {}
-                        for file in z.namelist():
-                            with z.open(file) as f:
-                                artifact_dict[file] = f.read()
-
-                extracted[artifact.name] = artifact_dict
-            else:
-                raise RuntimeError(
-                    f"Failed to download artifact {artifact.name}. "
-                    f"Status code: {response.status_code}"
-                )
+            extracted[artifact.name] = GitHubArtifact(
+                name=artifact.name,
+                archive_download_url=artifact.archive_download_url,
+                # Non-machine users cannot download from the archive_download_url and
+                # the GitHub API does not give us access to the public download url.
+                public_download_url=f"{self.repo.html_url}/actions/runs/{self.run_id}/artifacts/{artifact.id}",
+            )

-        logger.info("Download artifacts for run %s: %s", self.run_id, list(extracted.keys()))
         return extracted
+
+
+    async def download_artifact(self, artifact: GitHubArtifact) -> dict:
+        logger.info("Attempting to download artifact '%s' for run %s", artifact.name, self.run_id)
+
+        url = artifact.archive_download_url
+        headers = {"Authorization": f"token {self.token}"}
+        response = requests.get(url, headers=headers)
+
+        if response.status_code == 200:
+            artifact_dict = {}
+            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
+                for file in z.namelist():
+                    with z.open(file) as f:
+                        artifact_dict[file] = f.read()
+
+            logger.info("Downloaded artifact '%s' for run %s", artifact.name, self.run_id)
+            return artifact_dict
+        else:
+            raise RuntimeError(
+                f"Failed to download artifact {artifact.name}. "
+                f"Status code: {response.status_code}"
+            )
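Taken together, the new two-step artifact API from the diff above is used roughly like this (a usage sketch, not additional code from the commit):

# Sketch: list artifacts once, then download only what is needed.
index = run.get_artifact_index()                               # one GitHub API listing call
if "run-result" in index:
    files = await run.download_artifact(index["run-result"])  # fetched and unzipped in memory
    result_json = files["result.json"].decode("utf-8")

# The browser-friendly link for the profiling bundle comes from the index itself,
# without downloading anything:
profile_url = index["profile-data"].public_download_url if "profile-data" in index else None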

src/libkernelbot/report.py
Lines changed: 23 additions & 1 deletion

@@ -32,16 +32,30 @@ class Log:
     content: str


+@dataclasses.dataclass
+class Link:
+    """
+    Link represents a link in the profiling report, to result data
+    which can be downloaded by clicking it.
+    """
+    title: str
+    text: str
+    url: str
+
+
 class RunResultReport:
     def __init__(self, data=None):
-        self.data: List[Text | Log] = data or []
+        self.data: List[Text | Log | Link] = data or []

     def add_text(self, section: str):
         self.data.append(Text(section))

     def add_log(self, header: str, log: str):
         self.data.append(Log(header, log))

+    def add_link(self, title: str, text: str, url: str):
+        self.data.append(Link(title, text, url))
+
     def __repr__(self):
         return f"RunResultReport(data={self.data})"

@@ -267,6 +281,7 @@ def generate_system_info(system: SystemInfo):
 Running on:
 * GPU: `{system.gpu}`
 * CPU: `{system.cpu}`
+* Runtime: `{system.runtime}`
 * Platform: `{system.platform}`
 * Torch: `{system.torch}`
 """
@@ -322,6 +337,13 @@ def generate_report(result: FullResult) -> RunResultReport:  # noqa: C901
             make_profile_log(prof_run.run),
         )

+        if prof_run.profile is not None and prof_run.profile.download_url is not None:
+            report.add_link(
+                f"{prof_run.profile.profiler} profiling output",
+                "Download from GitHub",
+                prof_run.profile.download_url,
+            )
+
     if "leaderboard" in runs:
         bench_run = runs["leaderboard"]
         if _handle_crash_report(report, bench_run):
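End to end, a profiling link flows through the report like this (an illustrative sketch; the URL is a placeholder in the format the launcher constructs, and the profiler name depends on the run):

report = RunResultReport()
report.add_link(
    "profiler output",   # generate_report uses f"{profile.profiler} profiling output" here
    "Download from GitHub",
    "https://github.com/<org>/<repo>/actions/runs/<run_id>/artifacts/<artifact_id>",  # placeholder
)
# discord_reporter.display_report() then renders the Link part as a Markdown hyperlink:
#   profiler output: [Download from GitHub](https://github.com/<org>/<repo>/actions/runs/<run_id>/artifacts/<artifact_id>)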
