30
30
with :
31
31
aws-access-key-id : ${{ secrets.AWS_ACCESS_KEY_ID }}
32
32
aws-secret-access-key : ${{ secrets.AWS_SECRET_ACCESS_KEY }}
33
- aws-region: ${{ secrets.AWS_REGION }}
33
+ aws-region: ${{ vars.AWS_REGION }}
34
34
35
35
- name : Start EC2 runner
36
36
id : start-ec2-runner
@@ -171,7 +171,7 @@ jobs:
171
171
pip install .
172
172
pip install .[cuda]
173
173
174
- - name : Check disk
174
+ - name : Check disk before tests
175
175
run : |
176
176
df -h
177
177
@@ -188,14 +188,30 @@ jobs:
188
188
# we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
189
189
# and we know that it will be written into a directory created by `mktemp -d`.
190
190
# Given this information, we can use the following command to find the file:
191
- log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
192
- mv "${log_file}" training-log.jsonl
191
+ log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
192
+ phase_num=1;
193
+ for log_file in $log_files; do
194
+ mv "${log_file}" phase-${phase_num}-training-log.jsonl
195
+ ((phase_num++))
196
+ done
197
+
198
+ - name : Check disk after tests
199
+ run : |
200
+ df -h
201
+
202
+ - name : Upload training logs Phase 1
203
+ uses : actions/upload-artifact@v4
204
+ with :
205
+ name : phase-1-training-log.jsonl
206
+ path : ./instructlab/phase-1-training-log.jsonl
207
+ retention-days : 1
208
+ overwrite : true
193
209
194
- - name : Upload training logs
210
+ - name : Upload training logs Phase 2
195
211
uses : actions/upload-artifact@v4
196
212
with :
197
- name : training-log.jsonl
198
- path : ./instructlab/training-log.jsonl
213
+ name: phase-2-training-log.jsonl
214
+ path: ./instructlab/phase-2-training-log.jsonl
199
215
retention-days : 1
200
216
overwrite : true
201
217
@@ -259,7 +275,7 @@ jobs:
259
275
with :
260
276
aws-access-key-id : ${{ secrets.AWS_ACCESS_KEY_ID }}
261
277
aws-secret-access-key : ${{ secrets.AWS_SECRET_ACCESS_KEY }}
262
- aws-region: ${{ secrets.AWS_REGION }}
278
+ aws-region: ${{ vars.AWS_REGION }}
263
279
264
280
- name : Stop EC2 runner
265
281
uses : machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
@@ -269,36 +285,102 @@ jobs:
269
285
label : ${{ needs.start-large-ec2-runner.outputs.label }}
270
286
ec2-instance-id : ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
271
287
272
- - name : Download loss data
273
- id : download-logs
288
+ loss-graphs :
289
+ needs :
290
+ - stop-large-ec2-runner
291
+ runs-on : ubuntu-latest
292
+ if : ${{ always() }}
293
+ steps :
294
+ - name: "Harden Runner"
295
+ # v2.10.1
296
+ uses : step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
297
+ with :
298
+ egress-policy : audit
299
+
300
+ - name : Configure AWS credentials
301
+ uses : aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
302
+ with :
303
+ aws-access-key-id : ${{ secrets.AWS_ACCESS_KEY_ID }}
304
+ aws-secret-access-key : ${{ secrets.AWS_SECRET_ACCESS_KEY }}
305
+ aws-region : ${{ vars.AWS_REGION }}
306
+
307
+ - name : Download loss data Phase 1
308
+ id : phase-1-download-logs
309
+ uses : actions/download-artifact@v4
310
+ with :
311
+ name : phase-1-training-log.jsonl
312
+ path : downloaded-data
313
+
314
+ - name : Download loss data Phase 2
315
+ id : phase-2-download-logs
274
316
uses : actions/download-artifact@v4
275
317
with :
276
- name : training-log.jsonl
318
+ name: phase-2-training-log.jsonl
277
319
path : downloaded-data
278
320
321
+ - name : Checkout instructlab/training
322
+ uses : actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
323
+ with :
324
+ repository: "instructlab/training"
325
+ path: "training"
326
+ fetch-depth : 0
327
+
279
328
- name : Install dependencies
329
+ working-directory : ./training
280
330
run : |
331
+ python -m pip install --upgrade pip
281
332
pip install -r requirements-dev.txt
282
-
283
- - name : Try to upload to s3
284
- id : upload-s3
333
+
334
+ - name : Try to upload Phase 1 to s3
335
+ id: phase-1-upload-s3
285
336
continue-on-error : true
286
337
run : |
287
- output_file='./test.md'
288
- python scripts/create-loss-graph.py \
289
- --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
290
- --output-file "${output_file}" \
338
+ python training/scripts/create-loss-graph.py \
339
+ --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
340
+ --output-file "./phase-1-test.md" \
341
+ --phase "1" \
291
342
--aws-region "${{ vars.AWS_REGION }}" \
292
343
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
293
- --base-branch "${{ github.event.pull_request.base.ref }}" \
294
- --pr-number "${{ github.event.pull_request.number }}" \
295
- --head-sha "${{ github.event.pull_request.head.sha }}" \
344
+ --base-branch "${GITHUB_REF##*/}" \
345
+ --head-sha "${{ github.sha }}" \
346
+ --pr-number "${{ github.event.number }}" \
296
347
--origin-repository "${{ github.repository }}"
297
348
298
- - name : Check S3 upload status
299
- if : steps.upload-s3.outcome == 'failure'
349
+ - name : Try to upload Phase 2 to s3
350
+ id : phase-2-upload-s3
351
+ continue-on-error : true
300
352
run : |
301
- echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
353
+ python training/scripts/create-loss-graph.py \
354
+ --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
355
+ --output-file "./phase-2-test.md" \
356
+ --phase "2" \
357
+ --aws-region "${{ vars.AWS_REGION }}" \
358
+ --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
359
+ --base-branch "${GITHUB_REF##*/}" \
360
+ --head-sha "${{ github.sha }}" \
361
+ --pr-number "${{ github.event.number }}" \
362
+ --origin-repository "${{ github.repository }}"
363
+
364
+ - name : Check Phase 1 S3 upload status for success
365
+ if : steps.phase-1-upload-s3.outcome == 'success'
366
+ run : |
367
+ echo "Uploaded Phase 1 loss graph to S3."
368
+ cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"
369
+
370
+ - name : Check Phase 2 S3 upload status for success
371
+ if : steps.phase-2-upload-s3.outcome == 'success'
372
+ run : |
373
+ echo "Uploaded Phase 2 loss graph to S3."
374
+ cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"
375
+
376
+ - name : Check Phase 1 S3 upload status for failure
377
+ if : steps.phase-1-upload-s3.outcome == 'failure'
378
+ run : |
379
+ echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
380
+ echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
381
+
382
+ - name : Check Phase 2 S3 upload status for failure
383
+ if : steps.phase-2-upload-s3.outcome == 'failure'
384
+ run : |
385
+ echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
302
386
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
303
-
304
- cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
0 commit comments