Commit 31a047f
Profiling Infrastructure (#354)
* Simplify run_single_evaluation. This de-duplicates some code paths and makes it easier to patch profiling calls into the function later on.

* Add GPU runtime info to SystemInfo. This lets us tell later on whether we are running on CUDA or ROCm, and also fixes the ROCm fallback path.

* Add a 'link' report result type. This is used to communicate external download links, such as links to profiling results.

* Add profiling data infrastructure. A new ProfileResult type is added to run_eval and returned as part of EvalResult (a rough sketch of the type follows below). Among other fields, it contains `download_url`, which the user should use to download profiling data. Note that the actual public download link may not be known inside run_eval.py; in that case, the launcher is expected to fix up `download_url` before returning the results to libkernelbot.

* GitHub launcher: separate artifact downloading from indexing. The new function `GitHubRun.get_artifact_index` returns a dict of the artifacts available from the run. For each artifact, both the GitHub API URL and the public download URL are returned. The latter is not available directly from the GitHub API, but it can easily be constructed from data present in the workflow result. `download_artifacts` is replaced by a function that downloads a single, specified artifact rather than all of them. Additionally, the function no longer writes to a temp file when downloading; the response body can be piped directly into zipfile using BytesIO.

* GitHub runner: yield 'profile_data/*' from the job as profile data. The idea is that eval_run.py places profiling data in the profile_data/ directory, which is then automatically exported to the user. This is done by uploading that directory as the 'profile-data' artifact, fetching its public download link, and returning that link as ProfileResult.download_url.
1 parent 940b339 commit 31a047f
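For orientation, the sketch below shows roughly what the new ProfileResult could look like. This is an assumption: the real dataclass lives in libkernelbot/run_eval.py, which is not among the diffs shown on this page; the only fields this commit demonstrably uses are `profiler` (read in report.py) and `download_url` (patched by the GitHub launcher).

import dataclasses
from typing import Optional

@dataclasses.dataclass
class ProfileResult:
    # Sketch only: field set inferred from usage in this commit, not the real definition.
    profiler: str                       # name of the profiling tool, used in the report link title
    download_url: Optional[str] = None  # public link to the profiling artifact; the launcher
                                        # fixes this up before results reach libkernelbot

EvalResult then carries this as its new `profile` field, as the launcher diff below shows (`profile=profile_res`).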

File tree: 10 files changed, +214 -81 lines

.github/workflows/amd_workflow.yml
Lines changed: 12 additions & 5 deletions

@@ -35,13 +35,13 @@ jobs:
         run: |
           # Extract the payload content without printing it
           PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH)
-
+
           # Apply mask to the extracted content
           echo "::add-mask::$PAYLOAD"
-
+
           # Now write to file (won't be logged since it's masked)
           echo "$PAYLOAD" > payload.json
-
+
       - name: Set venv directory based on runner
         run: |
           if [[ "${{ github.event.inputs.runner }}" == "amdgpu-mi250-x86-64" ]]; then
@@ -77,5 +77,12 @@ jobs:
         if: always()
         with:
           name: run-result
-          path: |
-            result.json
+          path: result.json
+
+      - name: Upload profiling artifacts
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: profile-data
+          path: profile_data/*
+          retention-days: 1

.github/workflows/nvidia_workflow.yml
Lines changed: 10 additions & 5 deletions

@@ -42,10 +42,10 @@ jobs:
           # Extract the payload content without printing it
           apt-get update && apt-get install -y jq
           PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH)
-
+
           # Apply mask to the extracted content
           echo "::add-mask::$PAYLOAD"
-
+
           # Now write to file (won't be logged since it's masked)
           echo "$PAYLOAD" > payload.json

@@ -73,15 +73,20 @@ jobs:
         shell: bash
         run: |
           python src/runners/github-runner.py
-          cat result.json # Debug: show output

       - name: Upload training artifacts
         uses: actions/upload-artifact@v4
         if: always()
         with:
           name: run-result
-          path: |
-            result.json
+          path: result.json

+      - name: Upload profiling artifacts
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: profile-data
+          path: profile_data/*
+          retention-days: 1
     env:
       CUDA_VISIBLE_DEVICES: 0
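Both workflows simply upload whatever the evaluation step leaves under profile_data/. As a purely hypothetical illustration (the population logic lives in the runner and eval scripts, which are not part of the diffs on this page; the file name and profiler output are made up), the producing side only needs to stage files into that directory:

import pathlib
import shutil

# Hypothetical sketch: the only contract with the workflows above is the "profile_data" directory name.
out_dir = pathlib.Path("profile_data")
out_dir.mkdir(exist_ok=True)
shutil.copy("kernel_profile.ncu-rep", out_dir / "kernel_profile.ncu-rep")  # stage profiler output for upload

With retention-days set to 1, the uploaded profile-data artifact expires after a day, so the download link in the report is deliberately short-lived.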

scripts/ci_test_cuda.py
Lines changed: 3 additions & 1 deletion

@@ -4,7 +4,7 @@
 import pytest

 from libkernelbot.consts import ExitCode, SubmissionMode
-from libkernelbot.run_eval import compile_cuda_script, run_cuda_script
+from libkernelbot.run_eval import compile_cuda_script, make_system_info, run_cuda_script

 ref = Path("examples/identity_cuda/reference.cuh").read_text()
 task_h = Path("examples/identity_cuda/task.h").read_text()
@@ -19,6 +19,7 @@ def run_cuda_helper(sources: dict, headers: dict = None, arch=None, **kwargs):
         headers = header_files

     eval_result = run_cuda_script(
+        make_system_info(),
         sources,
         headers,
         arch=arch,
@@ -194,6 +195,7 @@ def test_include_dirs(tmp_path: Path):

     # can also use generic flags argument
     result = run_cuda_script(
+        make_system_info(),
         {"eval.cu": eval_cu, "submission.cu": sub},
         header_files,
         flags=["-I.", f"-I{tmp_path}"],

scripts/ci_test_python.py
Lines changed: 2 additions & 1 deletion

@@ -1,7 +1,7 @@
 from pathlib import Path

 from libkernelbot.consts import ExitCode, SubmissionMode
-from libkernelbot.run_eval import run_pytorch_script
+from libkernelbot.run_eval import make_system_info, run_pytorch_script

 ref = Path("examples/identity_py/reference.py").read_text()
 task = Path("examples/identity_py/task.py").read_text()
@@ -12,6 +12,7 @@

 def run_pytorch_helper(sources: dict, tests=None, **kwargs):
     result = run_pytorch_script(
+        make_system_info(),
         sources,
         "eval.py",
         mode=SubmissionMode.TEST.value,

src/kernelbot/discord_reporter.py
Lines changed: 6 additions & 0 deletions

@@ -2,6 +2,7 @@
 from discord_utils import _send_split_log

 from libkernelbot.report import (
+    Link,
     Log,
     MultiProgressReporter,
     RunProgressReporter,
@@ -69,6 +70,11 @@ async def display_report(self, title: str, report: RunResultReport):
                 message += part.text
             elif isinstance(part, Log):
                 message = await _send_split_log(thread, message, part.header, part.content)
+            elif isinstance(part, Link):
+                if len(message) > 0:
+                    await thread.send(message)
+                    message = ""
+                await thread.send(f"{part.title}: [{part.text}]({part.url})")

         if len(message) > 0:
             await thread.send(message)

src/libkernelbot/launchers/github.py
Lines changed: 71 additions & 38 deletions

@@ -1,10 +1,11 @@
 import asyncio
 import base64
+import dataclasses
 import datetime
+import io
 import json
 import math
 import pprint
-import tempfile
 import uuid
 import zipfile
 import zlib
@@ -23,7 +24,14 @@
     SubmissionMode,
 )
 from libkernelbot.report import RunProgressReporter
-from libkernelbot.run_eval import CompileResult, EvalResult, FullResult, RunResult, SystemInfo
+from libkernelbot.run_eval import (
+    CompileResult,
+    EvalResult,
+    FullResult,
+    ProfileResult,
+    RunResult,
+    SystemInfo,
+)
 from libkernelbot.utils import setup_logging

 from .launcher import Launcher
@@ -49,7 +57,7 @@ def __init__(self, repo: str, token: str, branch: str):
         self.token = token
         self.branch = branch

-    async def run_submission(
+    async def run_submission(  # noqa: C901
         self, config: dict, gpu_type: GPU, status: RunProgressReporter
     ) -> FullResult:
         gpu_vendor = None
@@ -106,15 +114,17 @@ async def run_submission(
         await status.push("Downloading artifacts...")
         logger.info("Downloading artifacts...")

-        artifacts = await run.download_artifacts()
-        if "run-result" not in artifacts:
-            logger.error("Could not find `run-result` among artifacts: %s", artifacts.keys())
+        index = run.get_artifact_index()
+
+        if "run-result" not in index:
+            logger.error("Could not find `run-result` among artifacts: %s", index.keys())
             await status.push("Downloading artifacts... failed")
             return FullResult(
                 success=False, error="Could not download artifacts", runs={}, system=SystemInfo()
             )

-        logs = artifacts["run-result"]["result.json"].decode("utf-8")
+        artifact = await run.download_artifact(index["run-result"])
+        logs = artifact["result.json"].decode("utf-8")

         await status.update("Downloading artifacts... done")
         logger.info("Downloading artifacts... done")
@@ -123,17 +133,24 @@
         runs = {}
         # convert json back to EvalResult structures, which requires
         # special handling for datetime and our dataclasses.
+
         for k, v in data["runs"].items():
-            if "compilation" in v and v["compilation"] is not None:
-                comp = CompileResult(**v["compilation"])
-            else:
-                comp = None
-            run = RunResult(**v["run"])
+            comp_res = None if v.get("compilation") is None else CompileResult(**v["compilation"])
+            run_res = None if v.get("run") is None else RunResult(**v["run"])
+            profile_res = None if v.get("profile") is None else ProfileResult(**v["profile"])
+
+            # Update profile artifact to the actual download URL.
+            # For the GitHub launcher the profile_artifact currently just contains
+            # the name of the artifact.
+            if profile_res is not None:
+                profile_res.download_url = index["profile-data"].public_download_url
+
             res = EvalResult(
                 start=datetime.datetime.fromisoformat(v["start"]),
                 end=datetime.datetime.fromisoformat(v["end"]),
-                compilation=comp,
-                run=run,
+                compilation=comp_res,
+                run=run_res,
+                profile=profile_res,
             )
             runs[k] = res

@@ -147,6 +164,13 @@ async def wait_callback(self, run: "GitHubRun", status: RunProgressReporter):
         )


+@dataclasses.dataclass
+class GitHubArtifact:
+    name: str
+    archive_download_url: str
+    public_download_url: str
+
+
 class GitHubRun:
     def __init__(self, repo: str, token: str, branch: str, workflow_file: str):
         gh = Github(token)
@@ -323,34 +347,43 @@ async def wait_for_completion(
             logger.error(f"Error waiting for GitHub run {self.run_id}: {e}", exc_info=e)
             raise  # Re-raise other exceptions

-    async def download_artifacts(self) -> dict:
-        logger.info("Attempting to download artifacts for run %s", self.run_id)
+
+    def get_artifact_index(self) -> dict[str, GitHubArtifact]:
+        logger.info("Creating artifact index for run %s", self.run_id)
         artifacts = self.run.get_artifacts()

         extracted = {}

         for artifact in artifacts:
-            url = artifact.archive_download_url
-            headers = {"Authorization": f"token {self.token}"}
-            response = requests.get(url, headers=headers)
-
-            if response.status_code == 200:
-                with tempfile.NamedTemporaryFile("w+b") as temp:
-                    temp.write(response.content)
-                    temp.flush()
-
-                    with zipfile.ZipFile(temp.name) as z:
-                        artifact_dict = {}
-                        for file in z.namelist():
-                            with z.open(file) as f:
-                                artifact_dict[file] = f.read()
-
-                extracted[artifact.name] = artifact_dict
-            else:
-                raise RuntimeError(
-                    f"Failed to download artifact {artifact.name}. "
-                    f"Status code: {response.status_code}"
-                )
+            extracted[artifact.name] = GitHubArtifact(
+                name=artifact.name,
+                archive_download_url=artifact.archive_download_url,
+                # Non-machine users cannot download from the archive_download_url and
+                # the GitHub API does not give us access to the public download url.
+                public_download_url=f"{self.repo.html_url}/actions/runs/{self.run_id}/artifacts/{artifact.id}",
+            )

-        logger.info("Download artifacts for run %s: %s", self.run_id, list(extracted.keys()))
         return extracted
+
+
+    async def download_artifact(self, artifact: GitHubArtifact) -> dict:
+        logger.info("Attempting to download artifact '%s' for run %s", artifact.name, self.run_id)
+
+        url = artifact.archive_download_url
+        headers = {"Authorization": f"token {self.token}"}
+        response = requests.get(url, headers=headers)
+
+        if response.status_code == 200:
+            artifact_dict = {}
+            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
+                for file in z.namelist():
+                    with z.open(file) as f:
+                        artifact_dict[file] = f.read()
+
+            logger.info("Downloaded artifact '%s' for run %s", artifact.name, self.run_id)
+            return artifact_dict
+        else:
+            raise RuntimeError(
+                f"Failed to download artifact {artifact.name}. "
+                f"Status code: {response.status_code}"
+            )
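Taken together, the new two-step artifact API from the diff above is used roughly like this (a usage sketch, not additional code from the commit):

# Sketch: list artifacts once, then download only what is needed.
index = run.get_artifact_index()                               # one GitHub API listing call
if "run-result" in index:
    files = await run.download_artifact(index["run-result"])  # fetched and unzipped in memory
    result_json = files["result.json"].decode("utf-8")

# The browser-friendly link for the profiling bundle comes from the index itself,
# without downloading anything:
profile_url = index["profile-data"].public_download_url if "profile-data" in index else None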

src/libkernelbot/report.py
Lines changed: 23 additions & 1 deletion

@@ -32,16 +32,30 @@ class Log:
     content: str


+@dataclasses.dataclass
+class Link:
+    """
+    Link represents a link in the profiling report, to result data
+    which can be downloaded by clicking it.
+    """
+    title: str
+    text: str
+    url: str
+
+
 class RunResultReport:
     def __init__(self, data=None):
-        self.data: List[Text | Log] = data or []
+        self.data: List[Text | Log | Link] = data or []

     def add_text(self, section: str):
         self.data.append(Text(section))

     def add_log(self, header: str, log: str):
         self.data.append(Log(header, log))

+    def add_link(self, title: str, text: str, url: str):
+        self.data.append(Link(title, text, url))
+
     def __repr__(self):
         return f"RunResultReport(data={self.data})"

@@ -267,6 +281,7 @@ def generate_system_info(system: SystemInfo):
 Running on:
 * GPU: `{system.gpu}`
 * CPU: `{system.cpu}`
+* Runtime: `{system.runtime}`
 * Platform: `{system.platform}`
 * Torch: `{system.torch}`
 """
@@ -322,6 +337,13 @@ def generate_report(result: FullResult) -> RunResultReport:  # noqa: C901
             make_profile_log(prof_run.run),
         )

+        if prof_run.profile is not None and prof_run.profile.download_url is not None:
+            report.add_link(
+                f"{prof_run.profile.profiler} profiling output",
+                "Download from GitHub",
+                prof_run.profile.download_url,
+            )
+
     if "leaderboard" in runs:
         bench_run = runs["leaderboard"]
         if _handle_crash_report(report, bench_run):
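End to end, a profiling link flows through the report like this (an illustrative sketch; the URL is a placeholder in the format the launcher constructs, and the profiler name depends on the run):

report = RunResultReport()
report.add_link(
    "profiler output",   # generate_report uses f"{profile.profiler} profiling output" here
    "Download from GitHub",
    "https://github.com/<org>/<repo>/actions/runs/<run_id>/artifacts/<artifact_id>",  # placeholder
)
# discord_reporter.display_report() then renders the Link part as a Markdown hyperlink:
#   profiler output: [Download from GitHub](https://github.com/<org>/<repo>/actions/runs/<run_id>/artifacts/<artifact_id>)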
