Skip to content

Commit 9e3b74a

Browse files
authored
Merge pull request #356 from alimaredia/fix-loss-graphs-ci
ci: Upload phase 1 & phase 2 training logs for loss graphs
2 parents 84c0f72 + 69fdaee commit 9e3b74a

File tree

3 files changed

+158
-38
lines changed

3 files changed

+158
-38
lines changed

.github/workflows/e2e-nvidia-l4-x1.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ jobs:
5151
with:
5252
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
5353
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
54-
aws-region: ${{ secrets.AWS_REGION }}
54+
aws-region: ${{ vars.AWS_REGION }}
5555

5656
- name: Start EC2 runner
5757
id: start-ec2-runner
@@ -187,7 +187,7 @@ jobs:
187187
with:
188188
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
189189
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
190-
aws-region: ${{ secrets.AWS_REGION }}
190+
aws-region: ${{ vars.AWS_REGION }}
191191

192192
- name: Stop EC2 runner
193193
uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7

.github/workflows/e2e-nvidia-l40s-x4.yml

Lines changed: 108 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ jobs:
3030
with:
3131
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
3232
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
33-
aws-region: ${{ secrets.AWS_REGION }}
33+
aws-region: ${{ vars.AWS_REGION }}
3434

3535
- name: Start EC2 runner
3636
id: start-ec2-runner
@@ -171,7 +171,7 @@ jobs:
171171
pip install .
172172
pip install .[cuda]
173173
174-
- name: Check disk
174+
- name: Check disk before tests
175175
run: |
176176
df -h
177177
@@ -188,14 +188,30 @@ jobs:
188188
# We know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in Python
189189
# and we know that it will be written into a directory created by `mktemp -d`.
190190
# Given this information, we can use the following command to find the log files (one per training phase):
191-
log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
192-
mv "${log_file}" training-log.jsonl
191+
log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
192+
phase_num=1;
193+
for log_file in $log_files; do
194+
mv "${log_file}" phase-${phase_num}-training-log.jsonl
195+
((phase_num++))
196+
done
197+
198+
- name: Check disk after tests
199+
run: |
200+
df -h
201+
202+
- name: Upload training logs Phase 1
203+
uses: actions/upload-artifact@v4
204+
with:
205+
name: phase-1-training-log.jsonl
206+
path: ./instructlab/phase-1-training-log.jsonl
207+
retention-days: 1
208+
overwrite: true
193209

194-
- name: Upload training logs
210+
- name: Upload training logs Phase 2
195211
uses: actions/upload-artifact@v4
196212
with:
197-
name: training-log.jsonl
198-
path: ./instructlab/training-log.jsonl
213+
name: phase-2-training-log.jsonl
214+
path: ./instructlab/phase-2-training-log.jsonl
199215
retention-days: 1
200216
overwrite: true
201217

@@ -259,7 +275,7 @@ jobs:
259275
with:
260276
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
261277
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
262-
aws-region: ${{ secrets.AWS_REGION }}
278+
aws-region: ${{ vars.AWS_REGION }}
263279

264280
- name: Stop EC2 runner
265281
uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
@@ -269,36 +285,102 @@ jobs:
269285
label: ${{ needs.start-large-ec2-runner.outputs.label }}
270286
ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
271287

272-
- name: Download loss data
273-
id: download-logs
288+
loss-graphs:
289+
needs:
290+
- stop-large-ec2-runner
291+
runs-on: ubuntu-latest
292+
if: ${{ always() }}
293+
steps:
294+
- name: "Harden Runner"
295+
# v2.10.1
296+
uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
297+
with:
298+
egress-policy: audit
299+
300+
- name: Configure AWS credentials
301+
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
302+
with:
303+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
304+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
305+
aws-region: ${{ vars.AWS_REGION }}
306+
307+
- name: Download loss data Phase 1
308+
id: phase-1-download-logs
309+
uses: actions/download-artifact@v4
310+
with:
311+
name: phase-1-training-log.jsonl
312+
path: downloaded-data
313+
314+
- name: Download loss data Phase 2
315+
id: phase-2-download-logs
274316
uses: actions/download-artifact@v4
275317
with:
276-
name: training-log.jsonl
318+
name: phase-2-training-log.jsonl
277319
path: downloaded-data
278320

321+
- name: Checkout instructlab/training
322+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
323+
with:
324+
repository: "instructlab/training"
325+
path: "training"
326+
fetch-depth: 0
327+
279328
- name: Install dependencies
329+
working-directory: ./training
280330
run: |
331+
python -m pip install --upgrade pip
281332
pip install -r requirements-dev.txt
282-
283-
- name: Try to upload to s3
284-
id: upload-s3
333+
334+
- name: Try to upload Phase 1 to s3
335+
id: phase-1-upload-s3
285336
continue-on-error: true
286337
run: |
287-
output_file='./test.md'
288-
python scripts/create-loss-graph.py \
289-
--log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
290-
--output-file "${output_file}" \
338+
python training/scripts/create-loss-graph.py \
339+
--log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
340+
--output-file "./phase-1-test.md" \
341+
--phase "1" \
291342
--aws-region "${{ vars.AWS_REGION }}" \
292343
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
293-
--base-branch "${{ github.event.pull_request.base.ref }}" \
294-
--pr-number "${{ github.event.pull_request.number }}" \
295-
--head-sha "${{ github.event.pull_request.head.sha }}" \
344+
--base-branch "${GITHUB_REF##*/}" \
345+
--head-sha "${{ github.sha }}" \
346+
--pr-number "${{ github.event.number }}" \
296347
--origin-repository "${{ github.repository }}"
297348
298-
- name: Check S3 upload status
299-
if: steps.upload-s3.outcome == 'failure'
349+
- name: Try to upload Phase 2 to s3
350+
id: phase-2-upload-s3
351+
continue-on-error: true
300352
run: |
301-
echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
353+
python training/scripts/create-loss-graph.py \
354+
--log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
355+
--output-file "./phase-2-test.md" \
356+
--phase "2" \
357+
--aws-region "${{ vars.AWS_REGION }}" \
358+
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
359+
--base-branch "${GITHUB_REF##*/}" \
360+
--head-sha "${{ github.sha }}" \
361+
--pr-number "${{ github.event.number }}" \
362+
--origin-repository "${{ github.repository }}"
363+
364+
- name: Check Phase 1 S3 upload status for success
365+
if: steps.phase-1-upload-s3.outcome == 'success'
366+
run: |
367+
echo "Uploaded Phase 1 loss graph to S3."
368+
cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"
369+
370+
- name: Check Phase 2 S3 upload status for success
371+
if: steps.phase-2-upload-s3.outcome == 'success'
372+
run: |
373+
echo "Uploaded Phase 2 loss graph to S3."
374+
cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"
375+
376+
- name: Check Phase 1 S3 upload status for failure
377+
if: steps.phase-1-upload-s3.outcome == 'failure'
378+
run: |
379+
echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
380+
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
381+
382+
- name: Check Phase 2 S3 upload status for failure
383+
if: steps.phase-2-upload-s3.outcome == 'failure'
384+
run: |
385+
echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
302386
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
303-
304-
cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"

scripts/create-loss-graph.py

Lines changed: 48 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,13 @@
1414
class Arguments(BaseModel):
1515
log_file: str | None = None
1616
output_file: str
17+
phase: str | None = None
18+
title: str | None = None
1719
aws_region: str
1820
bucket_name: str
1921
base_branch: str
20-
pr_number: str
2122
head_sha: str
23+
pr_number: str | None
2224
origin_repository: str
2325

2426

@@ -76,21 +78,37 @@ def write_to_s3(
7678
["aws", "s3", "cp", str(file), s3_path], capture_output=True, check=True
7779
)
7880
if results.returncode != 0:
79-
raise RuntimeError(f"failed to upload to s3: {results.stderr.decode('utf-8')}")
81+
raise RuntimeError(f"Failed to upload to s3: {results.stderr.decode('utf-8')}")
8082
else:
8183
print(results.stdout.decode("utf-8"))
8284

8385

84-
def get_destination_path(base_ref: str, pr_number: str, head_sha: str):
85-
return f"pulls/{base_ref}/{pr_number}/{head_sha}/loss-graph.png"
86+
def get_destination_path(base_ref: str, head_sha: str, phase: str | None):
87+
if phase is None:
88+
image_file_name = "loss-graph.png"
89+
else:
90+
image_file_name = f"loss-graph-{phase}.png"
91+
return f"loss_graphs/{base_ref}/{head_sha}/{image_file_name}"
8692

8793

8894
def write_md_file(
89-
output_file: Path, url: str, pr_number: str, head_sha: str, origin_repository: str
95+
output_file: Path,
96+
url: str,
97+
head_sha: str,
98+
origin_repository: str,
99+
title: str,
100+
pr_number: str | None,
90101
):
91102
commit_url = f"https://github.com/{origin_repository}/commit/{head_sha}"
103+
104+
if pr_number:
105+
pr_url = f"https://github.com/{origin_repository}/pull/{pr_number}"
106+
pr_str = f" ([PR {pr_number}]({pr_url}))"
107+
else:
108+
pr_str = ""
109+
92110
md_template = f"""
93-
# Loss Graph for PR {args.pr_number} ([{args.head_sha[:7]}]({commit_url}))
111+
# {title} ([{head_sha[:7]}]({commit_url})){pr_str}
94112
95113
![Loss Graph]({url})
96114
"""
@@ -107,9 +125,16 @@ def main(args: Arguments):
107125
loss_data = read_loss_data(log_file=log_file)
108126
output_image = Path("/tmp/loss-graph.png")
109127
output_file = Path(args.output_file)
128+
title = args.title
129+
if not title:
130+
if args.phase is None:
131+
phase_str = ""
132+
else:
133+
phase_str = f" for Phase {args.phase}"
134+
title = f"Training Loss Graph{phase_str}"
110135
render_image(loss_data=loss_data, outfile=output_image)
111136
destination_path = get_destination_path(
112-
base_ref=args.base_branch, pr_number=args.pr_number, head_sha=args.head_sha
137+
base_ref=args.base_branch, head_sha=args.head_sha, phase=args.phase
113138
)
114139
write_to_s3(
115140
file=output_image, bucket_name=args.bucket_name, destination=destination_path
@@ -122,9 +147,10 @@ def main(args: Arguments):
122147
write_md_file(
123148
output_file=output_file,
124149
url=s3_url,
125-
pr_number=args.pr_number,
126150
head_sha=args.head_sha,
127151
origin_repository=args.origin_repository,
152+
title=title,
153+
pr_number=args.pr_number,
128154
)
129155
print(f"Loss graph uploaded to '{s3_url}'")
130156
print(f"Markdown file written to '{output_file}'")
@@ -145,6 +171,16 @@ def main(args: Arguments):
145171
required=True,
146172
help="The output file where the resulting markdown will be written.",
147173
)
174+
parser.add_argument(
175+
"--phase",
176+
type=str,
177+
help="Phase of the loss graph to use for storage and within the title (if not specified)",
178+
)
179+
parser.add_argument(
180+
"--title",
181+
type=str,
182+
help="Title of the loss graph to use in the markdown output",
183+
)
148184
parser.add_argument(
149185
"--aws-region",
150186
type=str,
@@ -160,10 +196,10 @@ def main(args: Arguments):
160196
required=True,
161197
help="The base branch being merged to.",
162198
)
163-
parser.add_argument("--pr-number", type=str, required=True, help="The PR number")
164199
parser.add_argument(
165200
"--head-sha", type=str, required=True, help="The head SHA of the PR"
166201
)
202+
parser.add_argument("--pr-number", type=str, help="The PR number if applicable")
167203
parser.add_argument(
168204
"--origin-repository",
169205
type=str,
@@ -176,11 +212,13 @@ def main(args: Arguments):
176212
arguments = Arguments(
177213
log_file=args.log_file,
178214
output_file=args.output_file,
215+
phase=args.phase,
216+
title=args.title,
179217
aws_region=args.aws_region,
180218
bucket_name=args.bucket_name,
181219
base_branch=args.base_branch,
182-
pr_number=args.pr_number,
183220
head_sha=args.head_sha,
221+
pr_number=args.pr_number,
184222
origin_repository=args.origin_repository,
185223
)
186224
main(arguments)

0 commit comments

Comments
 (0)