Merge branch 'instructlab:main' into feature/add-try-catch-import-to-…

…deepspeed
instructlab · Oct 30, 2024 · e045f2d · e045f2d
2 parents b5bc651 + 45e8a9e
commit e045f2d
Show file tree

Hide file tree

Showing 16 changed files with 730 additions and 130 deletions.
diff --git a/.github/mergify.yml b/.github/mergify.yml
@@ -25,21 +25,21 @@ pull_request_rules:
         - -files~=^\.github/(actions|workflows)/.*\.ya?ml$
         - -files~=^\.github/workflows/actionlint\.
 
-    # e2e workflow
+    # e2e medium workflow
     - or:
       - and:
-        # note this should match the triggering criteria in 'e2e-nvidia-a10g-x1.yml'
-        - check-success=e2e-medium-workflow-complete
+        # note this should match the triggering criteria in 'e2e-nvidia-l4-x1.yml'
+        - check-success~=e2e-medium-workflow-complete
         - or:
           - files~=\.py$
           - files=pyproject.toml
           - files~=^requirements.*\.txt$
-          - files=.github/workflows/e2e-nvidia-a10g-x1.yml
+          - files=.github/workflows/e2e-nvidia-l4-x1.yml
       - and:
         - -files~=\.py$
         - -files=pyproject.toml
         - -files~=^requirements.*\.txt$
-        - -files=.github/workflows/e2e-nvidia-a10g-x1.yml
+        - -files=.github/workflows/e2e-nvidia-l4-x1.yml
 
     # code lint workflow
     - or:

diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
@@ -35,7 +35,7 @@ jobs:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
       - name: "Checkout"
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
 

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -37,7 +37,7 @@ jobs:
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
       - name: "Checkout"
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
       - name: "Check Markdown documents"

diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml → .github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml → .github/workflows/e2e-nvidia-l4-x1.yml
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-name: E2E (NVIDIA A10G x1)
+name: E2E (NVIDIA L4 x1)
 
 on:
   # run against every merge commit to 'main' and release branches
@@ -18,7 +18,7 @@ on:
       - '**.py'
       - 'pyproject.toml'
       - 'requirements**.txt'
-      - '.github/workflows/e2e-nvidia-a10g-x1.yml' # This workflow
+      - '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -54,8 +54,8 @@ jobs:
         with:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ami-01a89eee1adde309c
-          ec2-instance-type: g5.4xlarge
+          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
+          ec2-instance-type: g6.8xlarge
           subnet-id: subnet-02d230cffd9385bd4
           security-group-id: sg-06300447c4a5fbef3
           iam-role-name: instructlab-ci-runner
@@ -84,15 +84,15 @@ jobs:
           sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
 
       - name: Checkout instructlab/instructlab
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           repository: "instructlab/instructlab"
           path: "instructlab"
           # https://github.com/actions/checkout/issues/249
           fetch-depth: 0
 
       - name: Checkout instructlab/training
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           repository: "instructlab/training"
           path: "training"
@@ -117,19 +117,19 @@ jobs:
           nvidia-smi
           python3.11 -m pip cache remove llama_cpp_python
 
-          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .
+          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install -v .
 
           # https://github.com/instructlab/instructlab/issues/1821
           # install with Torch and build dependencies installed
-          python3.11 -m pip install packaging wheel setuptools-scm
-          python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt
+          python3.11 -m pip install -v packaging wheel setuptools-scm
+          python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt
 
       - name: Update instructlab-training library
         working-directory: ./training
         run: |
           . ../instructlab/venv/bin/activate
-          pip install .
-          pip install .[cuda]
+          pip install -v .
+          pip install -v .[cuda]
 
       - name: Check disk
         run: |
@@ -144,7 +144,6 @@ jobs:
           ./scripts/e2e-ci.sh -m
 
   stop-medium-ec2-runner:
-    name: Stop external EC2 runner
     needs:
       - start-medium-ec2-runner
       - e2e-medium-test

diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -0,0 +1,237 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: E2E (NVIDIA L40S x4)
+
+# Triggers: a daily schedule, plus manual dispatch against a PR or branch.
+on:
+  workflow_dispatch:
+    inputs:
+      pr_or_branch:
+        default: "main"
+        required: true
+        description: "pull request number or branch name"
+  schedule:
+    # Every day at 16:00 (4PM) UTC
+    - cron: "0 16 * * *"
+
+jobs:
+  # Launches the EC2 instance that hosts the self-hosted runner for this
+  # workflow and publishes its runner label and instance id as job outputs,
+  # so later jobs can run on it and shut it down again.
+  start-large-ec2-runner:
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
+          ec2-instance-type: g6e.12xlarge
+          subnet-id: subnet-024298cefa3bedd61
+          security-group-id: sg-06300447c4a5fbef3
+          iam-role-name: instructlab-ci-runner
+          # The 'GitHubPR' tag records the 'pr_or_branch' dispatch input.
+          # This workflow only runs on 'schedule' and 'workflow_dispatch',
+          # where github.event.number (a pull_request payload field) is never
+          # set, so that expression would always produce an empty tag value.
+          # On scheduled runs there is no input and the value stays empty.
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubPR", "Value": "${{ github.event.inputs.pr_or_branch }}"}
+            ]
+
+  # Runs the large e2e suite on the EC2 runner started by
+  # 'start-large-ec2-runner'. Checks out instructlab/instructlab and
+  # instructlab/training, switches the 'training' checkout to the requested
+  # PR or branch ('pr_or_branch' input, 'main' when unset), installs both,
+  # runs scripts/e2e-ci.sh, then reports the outcome on the PR or in Slack.
+  e2e-large-test:
+    needs:
+      - start-large-ec2-runner
+    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}
+
+    # GITHUB_TOKEN is used by the 'gh pr comment' steps below.
+    permissions:
+      pull-requests: write
+
+    steps:
+      - name: Install Packages
+        run: |
+          cat /etc/os-release
+          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
+
+      - name: Checkout instructlab/instructlab
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "instructlab/instructlab"
+          path: "instructlab"
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Checkout instructlab/training
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "instructlab/training"
+          path: "training"
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      # Classifies the input as a PR number (digits only) or a branch name and
+      # re-exports it as step outputs 'is_pr' and 'pr_or_branch'.
+      - name: Determine if pr_or_branch is a PR number
+        id: check_pr
+        env:
+          # Passed through the environment instead of being interpolated into
+          # the script, so the free-form input is never parsed as shell code.
+          # Defaults to 'main' if not set.
+          PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || 'main' }}
+        run: |
+          if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
+            echo "is_pr=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "is_pr=false" >> "$GITHUB_OUTPUT"
+          fi
+          echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"
+
+      - name: Check if gh cli is installed
+        id: gh_cli
+        run: |
+          if command -v gh &> /dev/null ; then
+            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Install gh CLI
+        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
+        run: |
+          sudo dnf install 'dnf-command(config-manager)' -y
+          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
+          sudo dnf install gh --repo gh-cli -y
+
+      - name: test gh CLI
+        run: |
+          gh --version
+
+      - name: set default repo
+        working-directory: ./training
+        run: |
+          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      # The PR-only steps below interpolate 'pr_or_branch' directly; that is
+      # safe because is_pr == 'true' implies the value matched ^[0-9]+$.
+      - name: Add comment to PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./training
+        run: |
+          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Fetch and checkout PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./training
+        run: |
+          gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Checkout branch
+        if: steps.check_pr.outputs.is_pr == 'false'
+        working-directory: ./training
+        env:
+          # Branch names are arbitrary text: hand the value over via the
+          # environment and quote it so it cannot inject or split commands.
+          PR_OR_BRANCH: ${{ steps.check_pr.outputs.pr_or_branch }}
+        run: |
+          git checkout "$PR_OR_BRANCH"
+
+      - name: Install ilab
+        working-directory: ./instructlab
+        run: |
+          export CUDA_HOME="/usr/local/cuda"
+          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+          export PATH="$PATH:$CUDA_HOME/bin"
+          python3.11 -m venv --upgrade-deps venv
+          . venv/bin/activate
+          nvidia-smi
+          python3.11 -m pip cache remove llama_cpp_python
+
+          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .
+
+          # https://github.com/instructlab/instructlab/issues/1821
+          # install with Torch and build dependencies installed
+          python3.11 -m pip install packaging wheel setuptools-scm
+          python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt
+
+      # Installs the checked-out 'training' sources into the venv created by
+      # the previous step.
+      - name: Update instructlab-training library
+        working-directory: ./training
+        run: |
+          . ../instructlab/venv/bin/activate
+          pip install .
+          pip install .[cuda]
+
+      - name: Check disk
+        run: |
+          df -h
+
+      - name: Run e2e test
+        working-directory: ./instructlab
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          . venv/bin/activate
+          ./scripts/e2e-ci.sh -l
+
+      # Result reporting: PR runs get a PR comment, branch runs a Slack message.
+      - name: Add comment to PR if the workflow failed
+        if: failure() && steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./training
+        run: |
+          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Add comment to PR if the workflow succeeded
+        if: success() && steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./training
+        run: |
+          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Post job results to Slack if the workflow failed
+        if: failure() && steps.check_pr.outputs.is_pr == 'false'
+        id: slack-report-failure
+        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
+        with:
+          # Slack channel id, channel name, or user id to post message.
+          # See also: https://api.slack.com/methods/chat.postMessage#channels
+          # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+          channel-id: 'e2e-ci-results'
+          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+
+      - name: Post job results to Slack if the workflow succeeded
+        if: success() && steps.check_pr.outputs.is_pr == 'false'
+        id: slack-report-success
+        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
+        with:
+          # Slack channel id, channel name, or user id to post message.
+          # See also: https://api.slack.com/methods/chat.postMessage#channels
+          # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+          channel-id: 'e2e-ci-results'
+          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+
+  # Shuts down the EC2 runner after the test job. 'always()' makes this run
+  # even when an earlier job failed or was cancelled.
+  stop-large-ec2-runner:
+    if: always()
+    runs-on: ubuntu-latest
+    needs: [start-large-ec2-runner, e2e-large-test]
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-region: ${{ secrets.AWS_REGION }}
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+
+      # Label and instance id come from the start job's outputs.
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: stop
+          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
+          label: ${{ needs.start-large-ec2-runner.outputs.label }}
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -63,13 +63,13 @@ jobs:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
       - name: "Checkout"
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           # https://github.com/actions/checkout/issues/249
           fetch-depth: 0
 
       - name: Setup Python 3.11
-        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
         with:
           python-version: 3.11
           cache: pip

diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
@@ -42,7 +42,7 @@ jobs:
                   egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
             - name: "Checkout"
-              uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+              uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
               with:
                   # for setuptools-scm
                   fetch-depth: 0