From 69fdaee6c339f96404099afb4a48efaa6a1f570e Mon Sep 17 00:00:00 2001 From: Ali Maredia Date: Mon, 25 Nov 2024 09:05:12 -0500 Subject: [PATCH] ci: Upload phase 1 & phase 2 training logs for loss graphs Signed-off-by: Ali Maredia Co-authored-by: Dan McPherson --- .github/workflows/e2e-nvidia-l4-x1.yml | 4 +- .github/workflows/e2e-nvidia-l40s-x4.yml | 134 ++++++++++++++++++----- scripts/create-loss-graph.py | 58 ++++++++-- 3 files changed, 158 insertions(+), 38 deletions(-) diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml index ef511319..752dccf1 100644 --- a/.github/workflows/e2e-nvidia-l4-x1.yml +++ b/.github/workflows/e2e-nvidia-l4-x1.yml @@ -51,7 +51,7 @@ jobs: with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ secrets.AWS_REGION }} + aws-region: ${{ vars.AWS_REGION }} - name: Start EC2 runner id: start-ec2-runner @@ -187,7 +187,7 @@ jobs: with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ secrets.AWS_REGION }} + aws-region: ${{ vars.AWS_REGION }} - name: Stop EC2 runner uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7 diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml index 52015d5f..70ad867a 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -30,7 +30,7 @@ jobs: with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ secrets.AWS_REGION }} + aws-region: ${{ vars.AWS_REGION }} - name: Start EC2 runner id: start-ec2-runner @@ -171,7 +171,7 @@ jobs: pip install . pip install .[cuda] - - name: Check disk + - name: Check disk before tests run: | df -h @@ -188,14 +188,30 @@ jobs: # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python # and we know that it will be written into a directory created by `mktemp -d`. # Given this information, we can use the following command to find the file: - log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl") - mv "${log_file}" training-log.jsonl + log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl") + phase_num=1; + for log_file in $log_files; do + mv "${log_file}" phase-${phase_num}-training-log.jsonl + ((phase_num++)) + done + + - name: Check disk after tests + run: | + df -h + + - name: Upload training logs Phase 1 + uses: actions/upload-artifact@v4 + with: + name: phase-1-training-log.jsonl + path: ./instructlab/phase-1-training-log.jsonl + retention-days: 1 + overwrite: true - - name: Upload training logs + - name: Upload training logs Phase 2 uses: actions/upload-artifact@v4 with: - name: training-log.jsonl - path: ./instructlab/training-log.jsonl + name: phase-2-training-log.jsonl + path: ./instructlab/phase-2-training-log.jsonl retention-days: 1 overwrite: true @@ -259,7 +275,7 @@ jobs: with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ secrets.AWS_REGION }} + aws-region: ${{ vars.AWS_REGION }} - name: Stop EC2 runner uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7 @@ -269,36 +285,102 @@ jobs: label: ${{ needs.start-large-ec2-runner.outputs.label }} ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }} - - name: Download loss data - id: download-logs + loss-graphs: + needs: + - stop-large-ec2-runner + runs-on: ubuntu-latest + if: ${{ always() }} + steps: + - name: "Harden Runner" + # v2.10.1 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 + with: + egress-policy: audit + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ vars.AWS_REGION }} + + - name: Download loss data Phase 1 + id: phase-1-download-logs + uses: actions/download-artifact@v4 + with: + name: phase-1-training-log.jsonl + path: downloaded-data + + - name: Download loss data Phase 2 + id: phase-2-download-logs uses: actions/download-artifact@v4 with: - name: training-log.jsonl + name: phase-2-training-log.jsonl path: downloaded-data + - name: Checkout instructlab/training + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: "instructlab/training" + path: "training" + fetch-depth: 0 + - name: Install dependencies + working-directory: ./training run: | + python -m pip install --upgrade pip pip install -r requirements-dev.txt - - - name: Try to upload to s3 - id: upload-s3 + + - name: Try to upload Phase 1 to s3 + id: phase-1-upload-s3 continue-on-error: true run: | - output_file='./test.md' - python scripts/create-loss-graph.py \ - --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \ - --output-file "${output_file}" \ + python training/scripts/create-loss-graph.py \ + --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \ + --output-file "./phase-1-test.md" \ + --phase "1" \ --aws-region "${{ vars.AWS_REGION }}" \ --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \ - --base-branch "${{ github.event.pull_request.base.ref }}" \ - --pr-number "${{ github.event.pull_request.number }}" \ - --head-sha "${{ github.event.pull_request.head.sha }}" \ + --base-branch "${GITHUB_REF##*/}" \ + --head-sha "${{ github.sha }}" \ + --pr-number "${{ github.event.number }}" \ --origin-repository "${{ github.repository }}" - - name: Check S3 upload status - if: steps.upload-s3.outcome == 'failure' + - name: Try to upload Phase 2 to s3 + id: phase-2-upload-s3 + continue-on-error: true run: | - echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate." + python training/scripts/create-loss-graph.py \ + --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \ + --output-file "./phase-2-test.md" \ + --phase "2" \ + --aws-region "${{ vars.AWS_REGION }}" \ + --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \ + --base-branch "${GITHUB_REF##*/}" \ + --head-sha "${{ github.sha }}" \ + --pr-number "${{ github.event.number }}" \ + --origin-repository "${{ github.repository }}" + + - name: Check Phase 1 S3 upload status for success + if: steps.phase-1-upload-s3.outcome == 'success' + run: | + echo "Uploaded Phase 1 loss graph to S3." + cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}" + + - name: Check Phase 2 S3 upload status for success + if: steps.phase-2-upload-s3.outcome == 'success' + run: | + echo "Uploaded Phase 2 loss graph to S3." + cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}" + + - name: Check Phase 1 S3 upload status for failure + if: steps.phase-1-upload-s3.outcome == 'failure' + run: | + echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate." + echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" + + - name: Check Phase 2 S3 upload status for failure + if: steps.phase-2-upload-s3.outcome == 'failure' + run: | + echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate." echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" - - cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}" \ No newline at end of file diff --git a/scripts/create-loss-graph.py b/scripts/create-loss-graph.py index e4ab1254..20821409 100644 --- a/scripts/create-loss-graph.py +++ b/scripts/create-loss-graph.py @@ -14,11 +14,13 @@ class Arguments(BaseModel): log_file: str | None = None output_file: str + phase: str | None = None + title: str | None = None aws_region: str bucket_name: str base_branch: str - pr_number: str head_sha: str + pr_number: str | None origin_repository: str @@ -76,21 +78,37 @@ def write_to_s3( ["aws", "s3", "cp", str(file), s3_path], capture_output=True, check=True ) if results.returncode != 0: - raise RuntimeError(f"failed to upload to s3: {results.stderr.decode('utf-8')}") + raise RuntimeError(f"Failed to upload to s3: {results.stderr.decode('utf-8')}") else: print(results.stdout.decode("utf-8")) -def get_destination_path(base_ref: str, pr_number: str, head_sha: str): - return f"pulls/{base_ref}/{pr_number}/{head_sha}/loss-graph.png" +def get_destination_path(base_ref: str, head_sha: str, phase: str | None): + if phase is None: + image_file_name = "loss-graph.png" + else: + image_file_name = f"loss-graph-{phase}.png" + return f"loss_graphs/{base_ref}/{head_sha}/{image_file_name}" def write_md_file( - output_file: Path, url: str, pr_number: str, head_sha: str, origin_repository: str + output_file: Path, + url: str, + head_sha: str, + origin_repository: str, + title: str, + pr_number: str | None, ): commit_url = f"https://github.com/{origin_repository}/commit/{head_sha}" + + if pr_number: + pr_url = f"https://github.com/{origin_repository}/pull/{pr_number}" + pr_str = f" ([PR {pr_number}]({pr_url}))" + else: + pr_str = "" + md_template = f""" -# Loss Graph for PR {args.pr_number} ([{args.head_sha[:7]}]({commit_url})) +# {title} ([{head_sha[:7]}]({commit_url})){pr_str} ![Loss Graph]({url}) """ @@ -107,9 +125,16 @@ def main(args: Arguments): loss_data = read_loss_data(log_file=log_file) output_image = Path("/tmp/loss-graph.png") output_file = Path(args.output_file) + title = args.title + if not title: + if args.phase is None: + phase_str = "" + else: + phase_str = f" for Phase {args.phase}" + title = f"Training Loss Graph{phase_str}" render_image(loss_data=loss_data, outfile=output_image) destination_path = get_destination_path( - base_ref=args.base_branch, pr_number=args.pr_number, head_sha=args.head_sha + base_ref=args.base_branch, head_sha=args.head_sha, phase=args.phase ) write_to_s3( file=output_image, bucket_name=args.bucket_name, destination=destination_path @@ -122,9 +147,10 @@ def main(args: Arguments): write_md_file( output_file=output_file, url=s3_url, - pr_number=args.pr_number, head_sha=args.head_sha, origin_repository=args.origin_repository, + title=title, + pr_number=args.pr_number, ) print(f"Loss graph uploaded to '{s3_url}'") print(f"Markdown file written to '{output_file}'") @@ -145,6 +171,16 @@ def main(args: Arguments): required=True, help="The output file where the resulting markdown will be written.", ) + parser.add_argument( + "--phase", + type=str, + help="Phase of the loss graph to use for storage and within the title (if not specified)", + ) + parser.add_argument( + "--title", + type=str, + help="Title of the loss graph to use in the markdown output", + ) parser.add_argument( "--aws-region", type=str, @@ -160,10 +196,10 @@ def main(args: Arguments): required=True, help="The base branch being merged to.", ) - parser.add_argument("--pr-number", type=str, required=True, help="The PR number") parser.add_argument( "--head-sha", type=str, required=True, help="The head SHA of the PR" ) + parser.add_argument("--pr-number", type=str, help="The PR number if applicable") parser.add_argument( "--origin-repository", type=str, @@ -176,11 +212,13 @@ def main(args: Arguments): arguments = Arguments( log_file=args.log_file, output_file=args.output_file, + phase=args.phase, + title=args.title, aws_region=args.aws_region, bucket_name=args.bucket_name, base_branch=args.base_branch, - pr_number=args.pr_number, head_sha=args.head_sha, + pr_number=args.pr_number, origin_repository=args.origin_repository, ) main(arguments)