Skip to content

Commit

Permalink
Merge branch 'instructlab:main' into feature/add-try-catch-import-to-…
Browse files Browse the repository at this point in the history
…deepspeed
  • Loading branch information
Harthi7 authored Oct 30, 2024
2 parents b5bc651 + 45e8a9e commit e045f2d
Show file tree
Hide file tree
Showing 16 changed files with 730 additions and 130 deletions.
10 changes: 5 additions & 5 deletions .github/mergify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,21 @@ pull_request_rules:
- -files~=^\.github/(actions|workflows)/.*\.ya?ml$
- -files~=^\.github/workflows/actionlint\.

# e2e workflow
# e2e medium workflow
- or:
- and:
# note this should match the triggering criteria in 'e2e-nvidia-a10g-x1.yml'
- check-success=e2e-medium-workflow-complete
# note this should match the triggering criteria in 'e2e-nvidia-l4-x1.yml'
- check-success~=e2e-medium-workflow-complete
- or:
- files~=\.py$
- files=pyproject.toml
- files~=^requirements.*\.txt$
- files=.github/workflows/e2e-nvidia-a10g-x1.yml
- files=.github/workflows/e2e-nvidia-l4-x1.yml
- and:
- -files~=\.py$
- -files=pyproject.toml
- -files~=^requirements.*\.txt$
- -files=.github/workflows/e2e-nvidia-a10g-x1.yml
- -files=.github/workflows/e2e-nvidia-l4-x1.yml

# code lint workflow
- or:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/actionlint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs

- name: "Checkout"
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
with:
egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
- name: "Checkout"
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: "Check Markdown documents"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0

name: E2E (NVIDIA A10G x1)
name: E2E (NVIDIA L4 x1)

on:
# run against every merge commit to 'main' and release branches
Expand All @@ -18,7 +18,7 @@ on:
- '**.py'
- 'pyproject.toml'
- 'requirements**.txt'
- '.github/workflows/e2e-nvidia-a10g-x1.yml' # This workflow
- '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
Expand Down Expand Up @@ -54,8 +54,8 @@ jobs:
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-01a89eee1adde309c
ec2-instance-type: g5.4xlarge
ec2-image-id: ${{ vars.AWS_EC2_AMI }}
ec2-instance-type: g6.8xlarge
subnet-id: subnet-02d230cffd9385bd4
security-group-id: sg-06300447c4a5fbef3
iam-role-name: instructlab-ci-runner
Expand Down Expand Up @@ -84,15 +84,15 @@ jobs:
sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
- name: Checkout instructlab/instructlab
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "instructlab/instructlab"
path: "instructlab"
# https://github.com/actions/checkout/issues/249
fetch-depth: 0

- name: Checkout instructlab/training
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "instructlab/training"
path: "training"
Expand All @@ -117,19 +117,19 @@ jobs:
nvidia-smi
python3.11 -m pip cache remove llama_cpp_python
CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .
CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install -v .
# https://github.com/instructlab/instructlab/issues/1821
# install with Torch and build dependencies installed
python3.11 -m pip install packaging wheel setuptools-scm
python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt
python3.11 -m pip install -v packaging wheel setuptools-scm
python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt
- name: Update instructlab-training library
working-directory: ./training
run: |
. ../instructlab/venv/bin/activate
pip install .
pip install .[cuda]
pip install -v .
pip install -v .[cuda]
- name: Check disk
run: |
Expand All @@ -144,7 +144,6 @@ jobs:
./scripts/e2e-ci.sh -m
stop-medium-ec2-runner:
name: Stop external EC2 runner
needs:
- start-medium-ec2-runner
- e2e-medium-test
Expand Down
237 changes: 237 additions & 0 deletions .github/workflows/e2e-nvidia-l40s-x4.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
# SPDX-License-Identifier: Apache-2.0

name: E2E (NVIDIA L40S x4)

on:
schedule:
- cron: '0 16 * * *' # Runs at 4PM UTC every day
workflow_dispatch:
inputs:
pr_or_branch:
description: 'pull request number or branch name'
required: true
default: 'main'

jobs:
start-large-ec2-runner:
runs-on: ubuntu-latest
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}

- name: Start EC2 runner
id: start-ec2-runner
uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ${{ vars.AWS_EC2_AMI }}
ec2-instance-type: g6e.12xlarge
subnet-id: subnet-024298cefa3bedd61
security-group-id: sg-06300447c4a5fbef3
iam-role-name: instructlab-ci-runner
aws-resource-tags: >
[
{"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
{"Key": "GitHubRef", "Value": "${{ github.ref }}"},
{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
]
e2e-large-test:
needs:
- start-large-ec2-runner
runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}

permissions:
pull-requests: write

steps:
- name: Install Packages
run: |
cat /etc/os-release
sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
- name: Checkout instructlab/instructlab
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "instructlab/instructlab"
path: "instructlab"
# https://github.com/actions/checkout/issues/249
fetch-depth: 0

- name: Checkout instructlab/training
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
repository: "instructlab/training"
path: "training"
# https://github.com/actions/checkout/issues/249
fetch-depth: 0

- name: Determine if pr_or_branch is a PR number
id: check_pr
run: |
PR_OR_BRANCH=${{ github.event.inputs.pr_or_branch || 'main' }} # Default to 'main' if not set
if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
echo "is_pr=true" >> "$GITHUB_OUTPUT"
else
echo "is_pr=false" >> "$GITHUB_OUTPUT"
fi
echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"
- name: Check if gh cli is installed
id: gh_cli
run: |
if command -v gh &> /dev/null ; then
echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
else
echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
fi
- name: Install gh CLI
if: steps.gh_cli.outputs.gh_cli_installed == 'false'
run: |
sudo dnf install 'dnf-command(config-manager)' -y
sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
sudo dnf install gh --repo gh-cli -y
- name: test gh CLI
run: |
gh --version
- name: set default repo
working-directory: ./training
run: |
gh repo set-default ${{ github.server_url }}/${{ github.repository }}
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: Add comment to PR
if: steps.check_pr.outputs.is_pr == 'true'
working-directory: ./training
run: |
gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: Fetch and checkout PR
if: steps.check_pr.outputs.is_pr == 'true'
working-directory: ./training
run: |
gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: Checkout branch
if: steps.check_pr.outputs.is_pr == 'false'
working-directory: ./training
run: |
git checkout ${{ steps.check_pr.outputs.pr_or_branch }}
- name: Install ilab
working-directory: ./instructlab
run: |
export CUDA_HOME="/usr/local/cuda"
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
export PATH="$PATH:$CUDA_HOME/bin"
python3.11 -m venv --upgrade-deps venv
. venv/bin/activate
nvidia-smi
python3.11 -m pip cache remove llama_cpp_python
CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .
# https://github.com/instructlab/instructlab/issues/1821
# install with Torch and build dependencies installed
python3.11 -m pip install packaging wheel setuptools-scm
python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt
- name: Update instructlab-training library
working-directory: ./training
run: |
. ../instructlab/venv/bin/activate
pip install .
pip install .[cuda]
- name: Check disk
run: |
df -h
- name: Run e2e test
working-directory: ./instructlab
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
. venv/bin/activate
./scripts/e2e-ci.sh -l
- name: Add comment to PR if the workflow failed
if: failure() && steps.check_pr.outputs.is_pr == 'true'
working-directory: ./training
run: |
gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: Add comment to PR if the workflow succeeded
if: success() && steps.check_pr.outputs.is_pr == 'true'
working-directory: ./training
run: |
gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

- name: Post job results to Slack if the workflow failed
if: failure() && steps.check_pr.outputs.is_pr == 'false'
id: slack-report-failure
uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
with:
# Slack channel id, channel name, or user id to post message.
# See also: https://api.slack.com/methods/chat.postMessage#channels
# You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
channel-id: 'e2e-ci-results'
slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}

- name: Post job results to Slack if the workflow succeeded
if: success() && steps.check_pr.outputs.is_pr == 'false'
id: slack-report-success
uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
with:
# Slack channel id, channel name, or user id to post message.
# See also: https://api.slack.com/methods/chat.postMessage#channels
# You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
channel-id: 'e2e-ci-results'
slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}

stop-large-ec2-runner:
needs:
- start-large-ec2-runner
- e2e-large-test
runs-on: ubuntu-latest
if: ${{ always() }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}

- name: Stop EC2 runner
uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-large-ec2-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
4 changes: 2 additions & 2 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,13 @@ jobs:
egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs

- name: "Checkout"
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
# https://github.com/actions/checkout/issues/249
fetch-depth: 0

- name: Setup Python 3.11
uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: 3.11
cache: pip
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pypi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:
egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs

- name: "Checkout"
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
# for setuptools-scm
fetch-depth: 0
Expand Down
Loading

0 comments on commit e045f2d

Please sign in to comment.