Skip to content

Commit

Permalink
Merge pull request #356 from alimaredia/fix-loss-graphs-ci
Browse files Browse the repository at this point in the history
ci: Upload phase 1 & phase 2 training logs for loss graphs
  • Loading branch information
mergify[bot] authored Nov 29, 2024
2 parents 84c0f72 + 69fdaee commit 9e3b74a
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 38 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/e2e-nvidia-l4-x1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ jobs:
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
aws-region: ${{ vars.AWS_REGION }}

- name: Start EC2 runner
id: start-ec2-runner
Expand Down Expand Up @@ -187,7 +187,7 @@ jobs:
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
aws-region: ${{ vars.AWS_REGION }}

- name: Stop EC2 runner
uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
Expand Down
134 changes: 108 additions & 26 deletions .github/workflows/e2e-nvidia-l40s-x4.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ jobs:
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
aws-region: ${{ vars.AWS_REGION }}

- name: Start EC2 runner
id: start-ec2-runner
Expand Down Expand Up @@ -171,7 +171,7 @@ jobs:
pip install .
pip install .[cuda]
- name: Check disk
- name: Check disk before tests
run: |
df -h
Expand All @@ -188,14 +188,30 @@ jobs:
# we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
# and we know that it will be written into a directory created by `mktemp -d`.
# Given this information, we can use the following command to find the file:
log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
mv "${log_file}" training-log.jsonl
log_files=$(find /tmp/ -name "training_params_and_metrics_global0.jsonl")
phase_num=1;
for log_file in $log_files; do
mv "${log_file}" phase-${phase_num}-training-log.jsonl
((phase_num++))
done
- name: Check disk after tests
run: |
df -h
- name: Upload training logs Phase 1
uses: actions/upload-artifact@v4
with:
name: phase-1-training-log.jsonl
path: ./instructlab/phase-1-training-log.jsonl
retention-days: 1
overwrite: true

- name: Upload training logs
- name: Upload training logs Phase 2
uses: actions/upload-artifact@v4
with:
name: training-log.jsonl
path: ./instructlab/training-log.jsonl
name: phase-2-training-log.jsonl
path: ./instructlab/phase-2-training-log.jsonl
retention-days: 1
overwrite: true

Expand Down Expand Up @@ -259,7 +275,7 @@ jobs:
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
aws-region: ${{ vars.AWS_REGION }}

- name: Stop EC2 runner
uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
Expand All @@ -269,36 +285,102 @@ jobs:
label: ${{ needs.start-large-ec2-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

- name: Download loss data
id: download-logs
loss-graphs:
needs:
- stop-large-ec2-runner
runs-on: ubuntu-latest
if: ${{ always() }}
steps:
- name: "Harden Runner"
# v2.10.1
uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7
with:
egress-policy: audit

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ vars.AWS_REGION }}

- name: Download loss data Phase 1
id: phase-1-download-logs
uses: actions/download-artifact@v4
with:
name: phase-1-training-log.jsonl
path: downloaded-data

- name: Download loss data Phase 2
id: phase-2-download-logs
uses: actions/download-artifact@v4
with:
name: training-log.jsonl
name: phase-2-training-log.jsonl
path: downloaded-data

- name: Checkout instructlab/training
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "instructlab/training"
path: "training"
fetch-depth: 0

- name: Install dependencies
working-directory: ./training
run: |
python -m pip install --upgrade pip
pip install -r requirements-dev.txt
- name: Try to upload to s3
id: upload-s3
- name: Try to upload Phase 1 to s3
id: phase-1-upload-s3
continue-on-error: true
run: |
output_file='./test.md'
python scripts/create-loss-graph.py \
--log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
--output-file "${output_file}" \
python training/scripts/create-loss-graph.py \
--log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
--output-file "./phase-1-test.md" \
--phase "1" \
--aws-region "${{ vars.AWS_REGION }}" \
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
--base-branch "${{ github.event.pull_request.base.ref }}" \
--pr-number "${{ github.event.pull_request.number }}" \
--head-sha "${{ github.event.pull_request.head.sha }}" \
--base-branch "${GITHUB_REF##*/}" \
--head-sha "${{ github.sha }}" \
--pr-number "${{ github.event.number }}" \
--origin-repository "${{ github.repository }}"
- name: Check S3 upload status
if: steps.upload-s3.outcome == 'failure'
- name: Try to upload Phase 2 to s3
id: phase-2-upload-s3
continue-on-error: true
run: |
echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
python training/scripts/create-loss-graph.py \
--log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
--output-file "./phase-2-test.md" \
--phase "2" \
--aws-region "${{ vars.AWS_REGION }}" \
--bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
--base-branch "${GITHUB_REF##*/}" \
--head-sha "${{ github.sha }}" \
--pr-number "${{ github.event.number }}" \
--origin-repository "${{ github.repository }}"
- name: Check Phase 1 S3 upload status for success
if: steps.phase-1-upload-s3.outcome == 'success'
run: |
echo "Uploaded Phase 1 loss graph to S3."
cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"
- name: Check Phase 2 S3 upload status for success
if: steps.phase-2-upload-s3.outcome == 'success'
run: |
echo "Uploaded Phase 2 loss graph to S3."
cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"
- name: Check Phase 1 S3 upload status for failure
if: steps.phase-1-upload-s3.outcome == 'failure'
run: |
echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
# Reports a failed Phase 2 loss-graph upload without failing the job.
# Runs only when the `phase-2-upload-s3` step (marked continue-on-error) failed.
- name: Check Phase 2 S3 upload status for failure
  if: steps.phase-2-upload-s3.outcome == 'failure'
  run: |
    echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
    echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
    # Intentionally no `cat` of a markdown file here: `output_file` is never
    # set in this step's shell, so `cat "${output_file}"` would exit non-zero
    # and fail the job, and the script only writes the markdown after a
    # successful S3 copy, so there is nothing to append on this path.
58 changes: 48 additions & 10 deletions scripts/create-loss-graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
class Arguments(BaseModel):
log_file: str | None = None
output_file: str
phase: str | None = None
title: str | None = None
aws_region: str
bucket_name: str
base_branch: str
pr_number: str
head_sha: str
pr_number: str | None
origin_repository: str


Expand Down Expand Up @@ -76,21 +78,37 @@ def write_to_s3(
["aws", "s3", "cp", str(file), s3_path], capture_output=True, check=True
)
if results.returncode != 0:
raise RuntimeError(f"failed to upload to s3: {results.stderr.decode('utf-8')}")
raise RuntimeError(f"Failed to upload to s3: {results.stderr.decode('utf-8')}")
else:
print(results.stdout.decode("utf-8"))


def get_destination_path(base_ref: str, pr_number: str, head_sha: str):
return f"pulls/{base_ref}/{pr_number}/{head_sha}/loss-graph.png"
def get_destination_path(base_ref: str, head_sha: str, phase: str | None):
if phase is None:
image_file_name = "loss-graph.png"
else:
image_file_name = f"loss-graph-{phase}.png"
return f"loss_graphs/{base_ref}/{head_sha}/{image_file_name}"


def write_md_file(
output_file: Path, url: str, pr_number: str, head_sha: str, origin_repository: str
output_file: Path,
url: str,
head_sha: str,
origin_repository: str,
title: str,
pr_number: str | None,
):
commit_url = f"https://github.com/{origin_repository}/commit/{head_sha}"

if pr_number:
pr_url = f"https://github.com/{origin_repository}/pull/{pr_number}"
pr_str = f" ([PR {pr_number}]({pr_url}))"
else:
pr_str = ""

md_template = f"""
# Loss Graph for PR {args.pr_number} ([{args.head_sha[:7]}]({commit_url}))
# {title} ([{head_sha[:7]}]({commit_url})){pr_str}
![Loss Graph]({url})
"""
Expand All @@ -107,9 +125,16 @@ def main(args: Arguments):
loss_data = read_loss_data(log_file=log_file)
output_image = Path("/tmp/loss-graph.png")
output_file = Path(args.output_file)
title = args.title
if not title:
if args.phase is None:
phase_str = ""
else:
phase_str = f" for Phase {args.phase}"
title = f"Training Loss Graph{phase_str}"
render_image(loss_data=loss_data, outfile=output_image)
destination_path = get_destination_path(
base_ref=args.base_branch, pr_number=args.pr_number, head_sha=args.head_sha
base_ref=args.base_branch, head_sha=args.head_sha, phase=args.phase
)
write_to_s3(
file=output_image, bucket_name=args.bucket_name, destination=destination_path
Expand All @@ -122,9 +147,10 @@ def main(args: Arguments):
write_md_file(
output_file=output_file,
url=s3_url,
pr_number=args.pr_number,
head_sha=args.head_sha,
origin_repository=args.origin_repository,
title=title,
pr_number=args.pr_number,
)
print(f"Loss graph uploaded to '{s3_url}'")
print(f"Markdown file written to '{output_file}'")
Expand All @@ -145,6 +171,16 @@ def main(args: Arguments):
required=True,
help="The output file where the resulting markdown will be written.",
)
parser.add_argument(
"--phase",
type=str,
help="Phase of the loss graph to use for storage and within the title (if not specified)",
)
parser.add_argument(
"--title",
type=str,
help="Title of the loss graph to use in the markdown output",
)
parser.add_argument(
"--aws-region",
type=str,
Expand All @@ -160,10 +196,10 @@ def main(args: Arguments):
required=True,
help="The base branch being merged to.",
)
parser.add_argument("--pr-number", type=str, required=True, help="The PR number")
parser.add_argument(
"--head-sha", type=str, required=True, help="The head SHA of the PR"
)
parser.add_argument("--pr-number", type=str, help="The PR number if applicable")
parser.add_argument(
"--origin-repository",
type=str,
Expand All @@ -176,11 +212,13 @@ def main(args: Arguments):
arguments = Arguments(
log_file=args.log_file,
output_file=args.output_file,
phase=args.phase,
title=args.title,
aws_region=args.aws_region,
bucket_name=args.bucket_name,
base_branch=args.base_branch,
pr_number=args.pr_number,
head_sha=args.head_sha,
pr_number=args.pr_number,
origin_repository=args.origin_repository,
)
main(arguments)

0 comments on commit 9e3b74a

Please sign in to comment.