Merge pull request #356 from alimaredia/fix-loss-graphs-ci

mergify[bot] · web-flow · commit 9e3b74a6854f · 2024-11-29T18:40:25.000Z
ci: Upload phase 1 & phase 2 training logs for loss graphs
diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -51,7 +51,7 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}
 
       - name: Start EC2 runner
         id: start-ec2-runner
@@ -187,7 +187,7 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}
 
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -30,7 +30,7 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}
 
       - name: Start EC2 runner
         id: start-ec2-runner
@@ -171,7 +171,7 @@ jobs:
           pip install .
           pip install .[cuda]
 
-      - name: Check disk
+      - name: Check disk before tests
         run: |
           df -h
 
@@ -188,14 +188,30 @@ jobs:
           # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
           # and we know that it will be written into a directory created by `mktemp -d`. 
           # Given this information, we can use the following command to find the file:
-          log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
-          mv "${log_file}" training-log.jsonl
+          log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
+          phase_num=1;
+          for log_file in $log_files; do
+              mv "${log_file}" phase-${phase_num}-training-log.jsonl
+              ((phase_num++))
+          done
+
+      - name: Check disk after tests
+        run: |
+          df -h
+
+      - name: Upload training logs Phase 1
+        uses: actions/upload-artifact@v4
+        with:
+          name: phase-1-training-log.jsonl
+          path: ./instructlab/phase-1-training-log.jsonl
+          retention-days: 1
+          overwrite: true
 
-      - name: Upload training logs
+      - name: Upload training logs Phase 2
         uses: actions/upload-artifact@v4
         with:
-          name: training-log.jsonl
-          path: ./instructlab/training-log.jsonl
+          name: phase-2-training-log.jsonl
+          path: ./instructlab/phase-2-training-log.jsonl
           retention-days: 1
           overwrite: true
 
@@ -259,7 +275,7 @@ jobs:
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
+          aws-region: ${{ vars.AWS_REGION }}
 
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
@@ -269,36 +285,102 @@ jobs:
           label: ${{ needs.start-large-ec2-runner.outputs.label }}
           ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
 
-      - name: Download loss data
-        id: download-logs
+  loss-graphs:
+    needs:
+      - stop-large-ec2-runner
+    runs-on: ubuntu-latest
+    if: ${{ always() }}
+    steps:
+      - name: "Harden Runner"
+        # v2.10.1
+        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
+        with:
+          egress-policy: audit
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ vars.AWS_REGION }}
+
+      - name: Download loss data Phase 1
+        id: phase-1-download-logs
+        uses: actions/download-artifact@v4
+        with:
+          name: phase-1-training-log.jsonl
+          path: downloaded-data
+
+      - name: Download loss data Phase 2
+        id: phase-2-download-logs
         uses: actions/download-artifact@v4
         with:
-          name: training-log.jsonl
+          name: phase-2-training-log.jsonl
           path: downloaded-data
 
+      - name: Checkout instructlab/training
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "instructlab/training"
+          path: "training"
+          fetch-depth: 0
+
       - name: Install dependencies
+        working-directory: ./training
         run: |
+          python -m pip install --upgrade pip
           pip install -r requirements-dev.txt
-      
-      - name: Try to upload to s3
-        id: upload-s3
+
+      - name: Try to upload Phase 1 to s3
+        id: phase-1-upload-s3
         continue-on-error: true
         run: |
-          output_file='./test.md' 
-          python scripts/create-loss-graph.py  \
-            --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
-            --output-file "${output_file}" \
+          python training/scripts/create-loss-graph.py  \
+            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
+            --output-file "./phase-1-test.md" \
+            --phase "1" \
             --aws-region "${{ vars.AWS_REGION }}" \
             --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
-            --base-branch "${{ github.event.pull_request.base.ref }}" \
-            --pr-number "${{ github.event.pull_request.number }}" \
-            --head-sha "${{ github.event.pull_request.head.sha }}" \
+            --base-branch "${GITHUB_REF##*/}" \
+            --head-sha "${{ github.sha }}" \
+            --pr-number "${{ github.event.number }}" \
             --origin-repository "${{ github.repository }}"
 
-      - name: Check S3 upload status
-        if: steps.upload-s3.outcome == 'failure'
+      - name: Try to upload Phase 2 to s3
+        id: phase-2-upload-s3
+        continue-on-error: true
         run: |
-          echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
+          python training/scripts/create-loss-graph.py  \
+            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
+            --output-file "./phase-2-test.md" \
+            --phase "2" \
+            --aws-region "${{ vars.AWS_REGION }}" \
+            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+            --base-branch "${GITHUB_REF##*/}" \
+            --head-sha "${{ github.sha }}" \
+            --pr-number "${{ github.event.number }}" \
+            --origin-repository "${{ github.repository }}"
+
+      - name: Check Phase 1 S3 upload status for success
+        if: steps.phase-1-upload-s3.outcome == 'success'
+        run: |
+          echo "Uploaded Phase 1 loss graph to S3."
+          cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Check Phase 2 S3 upload status for success
+        if: steps.phase-2-upload-s3.outcome == 'success'
+        run: |
+          echo "Uploaded Phase 2 loss graph to S3."
+          cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Check Phase 1 S3 upload status for failure
+        if: steps.phase-1-upload-s3.outcome == 'failure'
+        run: |
+          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
+          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
+
+      - name: Check Phase 2 S3 upload status for failure
+        if: steps.phase-2-upload-s3.outcome == 'failure'
+        run: |
+          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
           echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
-  
-          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
diff --git a/scripts/create-loss-graph.py b/scripts/create-loss-graph.py
@@ -14,11 +14,13 @@
 class Arguments(BaseModel):
     log_file: str | None = None
     output_file: str
+    phase: str | None = None
+    title: str | None = None
     aws_region: str
     bucket_name: str
     base_branch: str
-    pr_number: str
     head_sha: str
+    pr_number: str | None
     origin_repository: str
 
 
@@ -76,21 +78,37 @@ def write_to_s3(
         ["aws", "s3", "cp", str(file), s3_path], capture_output=True, check=True
     )
     if results.returncode != 0:
-        raise RuntimeError(f"failed to upload to s3: {results.stderr.decode('utf-8')}")
+        raise RuntimeError(f"Failed to upload to s3: {results.stderr.decode('utf-8')}")
     else:
         print(results.stdout.decode("utf-8"))
 
 
-def get_destination_path(base_ref: str, pr_number: str, head_sha: str):
-    return f"pulls/{base_ref}/{pr_number}/{head_sha}/loss-graph.png"
+def get_destination_path(base_ref: str, head_sha: str, phase: str | None):
+    if phase is None:
+        image_file_name = "loss-graph.png"
+    else:
+        image_file_name = f"loss-graph-{phase}.png"
+    return f"loss_graphs/{base_ref}/{head_sha}/{image_file_name}"
 
 
 def write_md_file(
-    output_file: Path, url: str, pr_number: str, head_sha: str, origin_repository: str
+    output_file: Path,
+    url: str,
+    head_sha: str,
+    origin_repository: str,
+    title: str,
+    pr_number: str | None,
 ):
     commit_url = f"https://github.com/{origin_repository}/commit/{head_sha}"
+
+    if pr_number:
+        pr_url = f"https://github.com/{origin_repository}/pull/{pr_number}"
+        pr_str = f" ([PR {pr_number}]({pr_url}))"
+    else:
+        pr_str = ""
+
     md_template = f"""
-# Loss Graph for PR {args.pr_number} ([{args.head_sha[:7]}]({commit_url}))
+# {title} ([{head_sha[:7]}]({commit_url})){pr_str}
 
 ![Loss Graph]({url})
 """
@@ -107,9 +125,16 @@ def main(args: Arguments):
     loss_data = read_loss_data(log_file=log_file)
     output_image = Path("/tmp/loss-graph.png")
     output_file = Path(args.output_file)
+    title = args.title
+    if not title:
+        if args.phase is None:
+            phase_str = ""
+        else:
+            phase_str = f" for Phase {args.phase}"
+        title = f"Training Loss Graph{phase_str}"
     render_image(loss_data=loss_data, outfile=output_image)
     destination_path = get_destination_path(
-        base_ref=args.base_branch, pr_number=args.pr_number, head_sha=args.head_sha
+        base_ref=args.base_branch, head_sha=args.head_sha, phase=args.phase
     )
     write_to_s3(
         file=output_image, bucket_name=args.bucket_name, destination=destination_path
@@ -122,9 +147,10 @@ def main(args: Arguments):
     write_md_file(
         output_file=output_file,
         url=s3_url,
-        pr_number=args.pr_number,
         head_sha=args.head_sha,
         origin_repository=args.origin_repository,
+        title=title,
+        pr_number=args.pr_number,
     )
     print(f"Loss graph uploaded to '{s3_url}'")
     print(f"Markdown file written to '{output_file}'")
@@ -145,6 +171,16 @@ def main(args: Arguments):
         required=True,
         help="The output file where the resulting markdown will be written.",
     )
+    parser.add_argument(
+        "--phase",
+        type=str,
+        help="Phase of the loss graph to use for storage and within the title (if not specified)",
+    )
+    parser.add_argument(
+        "--title",
+        type=str,
+        help="Title of the loss graph to use in the markdown output",
+    )
     parser.add_argument(
         "--aws-region",
         type=str,
@@ -160,10 +196,10 @@ def main(args: Arguments):
         required=True,
         help="The base branch being merged to.",
     )
-    parser.add_argument("--pr-number", type=str, required=True, help="The PR number")
     parser.add_argument(
         "--head-sha", type=str, required=True, help="The head SHA of the PR"
     )
+    parser.add_argument("--pr-number", type=str, help="The PR number if applicable")
     parser.add_argument(
         "--origin-repository",
         type=str,
@@ -176,11 +212,13 @@ def main(args: Arguments):
     arguments = Arguments(
         log_file=args.log_file,
         output_file=args.output_file,
+        phase=args.phase,
+        title=args.title,
         aws_region=args.aws_region,
         bucket_name=args.bucket_name,
         base_branch=args.base_branch,
-        pr_number=args.pr_number,
         head_sha=args.head_sha,
+        pr_number=args.pr_number,
         origin_repository=args.origin_repository,
     )
     main(arguments)