From 644ac10c92c38bfbeb87ba5698084757a80408a5 Mon Sep 17 00:00:00 2001 From: Ruturaj4 Date: Wed, 31 Jul 2024 15:04:58 -0500 Subject: [PATCH 01/22] [ROCm] improve gpu script --- build/rocm/Dockerfile.ms | 3 +- build/rocm/run_single_gpu.py | 53 ++++++++++++++++++++++++++++++++---- 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/build/rocm/Dockerfile.ms b/build/rocm/Dockerfile.ms index 5f831f111b25..5fc0afa326af 100644 --- a/build/rocm/Dockerfile.ms +++ b/build/rocm/Dockerfile.ms @@ -32,6 +32,5 @@ RUN git clone https://github.com/pyenv/pyenv.git /pyenv ENV PYENV_ROOT /pyenv ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH RUN pyenv install $PYTHON_VERSION -RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-reportlog pytest-rerunfailures cloudpickle portpicker matplotlib absl-py flatbuffers hypothesis - +RUN eval "$(pyenv init -)" && pyenv local ${PYTHON_VERSION} && pip3 install --upgrade --force-reinstall setuptools pip && pip install numpy setuptools build wheel six auditwheel scipy pytest pytest-html pytest_html_merger pytest-reportlog pytest-json-report pytest-csv pytest-rerunfailures cloudpickle portpicker matplotlib absl-py flatbuffers hypothesis diff --git a/build/rocm/run_single_gpu.py b/build/rocm/run_single_gpu.py index 4eedc8d4e2a5..4e7660ca1f15 100755 --- a/build/rocm/run_single_gpu.py +++ b/build/rocm/run_single_gpu.py @@ -14,6 +14,7 @@ # limitations under the License. 
import os +import csv import json import argparse import threading @@ -29,6 +30,34 @@ def extract_filename(path): file_name, _ = os.path.splitext(base_name) return file_name + +def combine_json_reports(): + all_json_files = [f for f in os.listdir(base_dir) if f.endswith('_log.json')] + combined_data = [] + for json_file in all_json_files: + with open(os.path.join(base_dir, json_file), 'r') as infile: + data = json.load(infile) + combined_data.append(data) + combined_json_file = f"{base_dir}/final_compiled_report.json" + with open(combined_json_file, 'w') as outfile: + json.dump(combined_data, outfile, indent=4) + + +def combine_csv_reports(): + all_csv_files = [f for f in os.listdir(base_dir) if f.endswith('_log.csv')] + combined_csv_file = f"{base_dir}/final_compiled_report.csv" + with open(combined_csv_file, mode='w', newline='') as outfile: + csv_writer = csv.writer(outfile) + for i, csv_file in enumerate(all_csv_files): + with open(os.path.join(base_dir, csv_file), mode='r') as infile: + csv_reader = csv.reader(infile) + if i == 0: + # write headers only once + csv_writer.writerow(next(csv_reader)) + for row in csv_reader: + csv_writer.writerow(row) + + def generate_final_report(shell=False, env_vars={}): env = os.environ env = {**env, **env_vars} @@ -41,7 +70,10 @@ def generate_final_report(shell=False, env_vars={}): print("FAILED - {}".format(" ".join(cmd))) print(result.stderr.decode()) - return result.returncode, result.stderr.decode(), result.stdout.decode() + # Generate json reports. + combine_json_reports() + # Generate csv reports. 
+ combine_csv_reports() def run_shell_command(cmd, shell=False, env_vars={}): @@ -66,7 +98,7 @@ def parse_test_log(log_file): report = json.loads(line) if "nodeid" in report: module = report["nodeid"].split("::")[0] - if module: + if module and ".py" in module: test_files.add(os.path.abspath(module)) return test_files @@ -100,9 +132,20 @@ def run_test(testmodule, gpu_tokens, continue_on_fail): } testfile = extract_filename(testmodule) if continue_on_fail: - cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-v", testmodule] + cmd = ["python3", "-m", "pytest", + "--json-report", f"--json-report-file={base_dir}/{testfile}_log.json", + f"--csv={base_dir}/{testfile}_log.csv", + "--csv-columns", "id,module,name,file,status,duration", + f"--html={base_dir}/{testfile}_log.html", + "--reruns", "3", "-v", testmodule] else: - cmd = ["python3", "-m", "pytest", '--html={}/{}_log.html'.format(base_dir, testfile), "--reruns", "3", "-x", "-v", testmodule] + cmd = ["python3", "-m", "pytest", + "--json-report", f"--json-report-file={base_dir}/{testfile}_log.json", + f"--csv={base_dir}/{testfile}_log.csv", + "--csv-columns", "id,module,name,file,status,duration", + f"--html={base_dir}/{testfile}_log.html", + "--reruns", "3", "-x", "-v", testmodule] + return_code, stderr, stdout = run_shell_command(cmd, env_vars=env_vars) with GPU_LOCK: gpu_tokens.append(target_gpu) @@ -115,7 +158,7 @@ def run_test(testmodule, gpu_tokens, continue_on_fail): def run_parallel(all_testmodules, p, c): - print(f"Running tests with parallelism=", p) + print(f"Running tests with parallelism = {p}") available_gpu_tokens = list(range(p)) executor = ThreadPoolExecutor(max_workers=p) # walking through test modules. 
From 109259e0029ed5f3f58573ff5e1b0c1055285cb3 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Fri, 20 Sep 2024 21:43:49 +0000 Subject: [PATCH 02/22] Add rocm-ci.yaml --- .github/workflows/rocm-ci.yaml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/rocm-ci.yaml diff --git a/.github/workflows/rocm-ci.yaml b/.github/workflows/rocm-ci.yaml new file mode 100644 index 000000000000..55d329d7cb5f --- /dev/null +++ b/.github/workflows/rocm-ci.yaml @@ -0,0 +1,23 @@ +name: Print Environment Variables + +on: [push] + +jobs: + print-env: + runs-on: mi-250 + steps: + - name: Print Environment Variables + run: printenv + + - name: Set up Docker + uses: docker/setup-buildx-action@v2 + + - name: Try rocm-smi + run: | + docker run \ + --group-add video \ + $RENDER_DEVICES \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --shm-size=64G \ + rocm/rocm-terminal rocm-smi \ No newline at end of file From 44a496be0902862076c100a6e68e64f2a7767c89 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Mon, 23 Sep 2024 15:57:20 +0000 Subject: [PATCH 03/22] add rocm-ci file --- .github/workflows/ci-build.yaml | 5 +++++ .github/workflows/rocm-ci.yaml | 28 +++++++++++++++------------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci-build.yaml b/.github/workflows/ci-build.yaml index 5d46f8fbf0d8..c01d44af9cf6 100644 --- a/.github/workflows/ci-build.yaml +++ b/.github/workflows/ci-build.yaml @@ -22,6 +22,7 @@ permissions: jobs: lint_and_typecheck: + if: false runs-on: ubuntu-latest timeout-minutes: 5 steps: @@ -38,6 +39,7 @@ jobs: - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # ratchet: pre-commit/action@v3.0.1 build: + if: false name: "build ${{ matrix.name-prefix }} (py ${{ matrix.python-version }} on ${{ matrix.os }}, x64=${{ matrix.enable-x64}})" runs-on: ${{ matrix.os }} timeout-minutes: 60 @@ -103,6 +105,7 @@ jobs: documentation: + if: false name: Documentation - 
test code snippets runs-on: ubuntu-latest timeout-minutes: 10 @@ -145,6 +148,7 @@ jobs: documentation_render: + if: false name: Documentation - render documentation runs-on: ubuntu-latest timeout-minutes: 10 @@ -181,6 +185,7 @@ jobs: jax2tf_test: + if: false name: "jax2tf_test (py ${{ matrix.python-version }} on ${{ matrix.os }}, x64=${{ matrix.enable-x64}})" runs-on: ${{ matrix.os }} timeout-minutes: 30 diff --git a/.github/workflows/rocm-ci.yaml b/.github/workflows/rocm-ci.yaml index 55d329d7cb5f..380ee153250b 100644 --- a/.github/workflows/rocm-ci.yaml +++ b/.github/workflows/rocm-ci.yaml @@ -1,4 +1,4 @@ -name: Print Environment Variables +name: ROCM CI on: [push] @@ -8,16 +8,18 @@ jobs: steps: - name: Print Environment Variables run: printenv + + - - name: Set up Docker - uses: docker/setup-buildx-action@v2 - - - name: Try rocm-smi - run: | - docker run \ - --group-add video \ - $RENDER_DEVICES \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --shm-size=64G \ - rocm/rocm-terminal rocm-smi \ No newline at end of file +# - name: Set up Docker +# uses: docker/setup-buildx-action@v2 +# +# - name: Try rocm-smi +# run: | +# docker run \ +# --group-add video \ +# $RENDER_DEVICES \ +# --cap-add=SYS_PTRACE \ +# --security-opt seccomp=unconfined \ +# --shm-size=64G \ +# rocm/rocm-terminal rocm-smi \ No newline at end of file From d41c479b427910bef4547f75d10baea090b9d456 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Fri, 27 Sep 2024 18:06:27 +0000 Subject: [PATCH 04/22] Build docker in workflow --- .github/workflows/ci-build.yaml | 4 ++-- .github/workflows/rocm-ci.yaml | 21 ++++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci-build.yaml b/.github/workflows/ci-build.yaml index a2e45c1a8fc6..eb7e9a20046d 100644 --- a/.github/workflows/ci-build.yaml +++ b/.github/workflows/ci-build.yaml @@ -48,13 +48,13 @@ jobs: include: - name-prefix: "with 3.10" python-version: "3.10" - os: 
ubuntu-20.04-16core # update to custom rocm runner enable-x64: 1 prng-upgrade: 1 num_generated_cases: 1 - name-prefix: "with 3.12" python-version: "3.12" - os: ubuntu-20.04-16core + os: ubuntu-20.04-16core # Update to custom rocm runner enable-x64: 0 prng-upgrade: 0 num_generated_cases: 1 diff --git a/.github/workflows/rocm-ci.yaml b/.github/workflows/rocm-ci.yaml index 380ee153250b..32ffb6be4048 100644 --- a/.github/workflows/rocm-ci.yaml +++ b/.github/workflows/rocm-ci.yaml @@ -3,17 +3,24 @@ name: ROCM CI on: [push] jobs: - print-env: - runs-on: mi-250 + build-docker: # strategy and matrix come here + runs-on: mi-250 steps: - name: Print Environment Variables run: printenv + - name: Set up Docker + uses: docker/setup-buildx-action@v2 + - uses: actions/checkout@v4 + - name: Build Docker + env: + BUILD_TAG: rocm_jax_r6_1_3_py3_10_id${GITHUB_RUN_ID} + # XLA_CLONE_DIR: + run: | + ./build/rocm/ci_build.sh --rocm_version 6.1.3 \ + --keep_image --py_version 3.10 \ + --runtime bash -c "./build/rocm/build_rocm.sh" - - -# - name: Set up Docker -# uses: docker/setup-buildx-action@v2 -# + # - name: Try rocm-smi # run: | # docker run \ From 89dfac4b2a081389099d86a7d8dac506b7417650 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Fri, 27 Sep 2024 20:45:21 +0000 Subject: [PATCH 05/22] remove redundant step --- .github/workflows/rocm-ci.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/rocm-ci.yaml b/.github/workflows/rocm-ci.yaml index 32ffb6be4048..dfd2714be954 100644 --- a/.github/workflows/rocm-ci.yaml +++ b/.github/workflows/rocm-ci.yaml @@ -8,8 +8,6 @@ jobs: steps: - name: Print Environment Variables run: printenv - - name: Set up Docker - uses: docker/setup-buildx-action@v2 - uses: actions/checkout@v4 - name: Build Docker env: From 74e60252e8cdb07b32978ca1a819cc1fbb2b5cb0 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Fri, 27 Sep 2024 21:08:06 +0000 Subject: [PATCH 06/22] fix the pwd issue in docker, string fails to decode --- 
build/rocm/ci_build | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/build/rocm/ci_build b/build/rocm/ci_build index aeb0201e27ed..3c051be8c027 100755 --- a/build/rocm/ci_build +++ b/build/rocm/ci_build @@ -83,11 +83,15 @@ def dist_wheels( cmd = ["docker", "run"] + # docker run fails when mounting ./ + own_path = os.path.dirname(os.path.abspath(__file__)) + repo_path = os.path.abspath(os.path.join(own_path, "..","..")) + whl_path = os.path.join(repo_path, "wheelhouse") mounts = [ "-v", - "./:/jax", + "%s:/jax" % repo_path, "-v", - "./wheelhouse:/wheelhouse", + "%s:/wheelhouse" % whl_path, ] if xla_path: @@ -130,10 +134,16 @@ def _fetch_jax_metadata(xla_path): jax_version = subprocess.check_output(cmd, env=env) + def safe_decode(x): + if isinstance(x, str): + return x + else: + return x.decode("utf8") + return { - "jax_version": jax_version.decode("utf8").strip(), - "jax_commit": jax_commit.decode("utf8").strip(), - "xla_commit": xla_commit.decode("utf8").strip(), + "jax_version": safe_decode(jax_version).strip(), + "jax_commit": safe_decode(jax_commit).strip(), + "xla_commit": safe_decode(xla_commit).strip(), } @@ -204,9 +214,12 @@ def test(image_name): # NOTE(mrodden): we need jax source dir for the unit test code only, # JAX and jaxlib are already installed from wheels + # docker run fails when mounting ./ + own_path = os.path.dirname(os.path.abspath(__file__)) + repo_path = os.path.abspath(os.path.join(own_path, "..","..")) mounts = [ "-v", - "./:/jax", + "%s:/jax" % repo_path, ] cmd.extend(mounts) From 2928bb6b8391404a1d4acea9b951ac20538e37c5 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Fri, 27 Sep 2024 21:43:35 +0000 Subject: [PATCH 07/22] update amdgpu repo url to include version number --- build/rocm/tools/get_rocm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build/rocm/tools/get_rocm.py b/build/rocm/tools/get_rocm.py index 5334bf40ece7..2bcae5f9064c 100644 --- 
a/build/rocm/tools/get_rocm.py +++ b/build/rocm/tools/get_rocm.py @@ -320,11 +320,12 @@ def setup_repos_el8(rocm_version_str): """ [amdgpu] name=amdgpu -baseurl=https://repo.radeon.com/amdgpu/latest/rhel/8.8/main/x86_64/ +baseurl=https://repo.radeon.com/amdgpu/%s/rhel/8.8/main/x86_64/ enabled=1 gpgcheck=1 gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key """ + % rocm_version_str ) From c5131001a649e2605fe7ba0ba82da86e08e7319e Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Mon, 30 Sep 2024 20:23:02 +0000 Subject: [PATCH 08/22] fix run id typo --- .github/workflows/rocm-ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rocm-ci.yaml b/.github/workflows/rocm-ci.yaml index dfd2714be954..9ae2d8d3c36c 100644 --- a/.github/workflows/rocm-ci.yaml +++ b/.github/workflows/rocm-ci.yaml @@ -11,7 +11,7 @@ jobs: - uses: actions/checkout@v4 - name: Build Docker env: - BUILD_TAG: rocm_jax_r6_1_3_py3_10_id${GITHUB_RUN_ID} + BUILD_TAG: rocm_jax_r6_1_3_py3_10_id${{ github.run_id }} # XLA_CLONE_DIR: run: | ./build/rocm/ci_build.sh --rocm_version 6.1.3 \ From c8c858f0820a6369463634a3645921a6e0b4ff23 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Mon, 30 Sep 2024 22:17:56 +0000 Subject: [PATCH 09/22] add cleanup --- .github/workflows/rocm-ci.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rocm-ci.yaml b/.github/workflows/rocm-ci.yaml index 9ae2d8d3c36c..57a99c378788 100644 --- a/.github/workflows/rocm-ci.yaml +++ b/.github/workflows/rocm-ci.yaml @@ -15,8 +15,10 @@ jobs: # XLA_CLONE_DIR: run: | ./build/rocm/ci_build.sh --rocm_version 6.1.3 \ - --keep_image --py_version 3.10 \ - --runtime bash -c "./build/rocm/build_rocm.sh" + --keep_image --py_version 3.10 + - cleanup: Remove workspace artifacts + if: always() + run: rm -rf ${{ github.workspace }} # - name: Try rocm-smi From 0bf3f5deef505c6d7a618fbcdada43d4c8ffb426 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Mon, 30 Sep 2024 22:23:42 
+0000 Subject: [PATCH 10/22] fix name error --- .github/workflows/rocm-ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rocm-ci.yaml b/.github/workflows/rocm-ci.yaml index 57a99c378788..9ba9e1f54b41 100644 --- a/.github/workflows/rocm-ci.yaml +++ b/.github/workflows/rocm-ci.yaml @@ -16,7 +16,7 @@ jobs: run: | ./build/rocm/ci_build.sh --rocm_version 6.1.3 \ --keep_image --py_version 3.10 - - cleanup: Remove workspace artifacts + - name: Cleanup if: always() run: rm -rf ${{ github.workspace }} From 5a50530dc027a94af895111d1e0d8039671d2adc Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Tue, 1 Oct 2024 15:56:27 +0000 Subject: [PATCH 11/22] delete the dist directory --- build/rocm/tools/build_wheels.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build/rocm/tools/build_wheels.py b/build/rocm/tools/build_wheels.py index b6dd1256e2f5..01a1025409f6 100644 --- a/build/rocm/tools/build_wheels.py +++ b/build/rocm/tools/build_wheels.py @@ -254,6 +254,8 @@ def main(): if os.path.basename(whl).startswith("jax-"): LOG.info("Copying %s into %s" % (whl, wheelhouse_dir)) shutil.copy(whl, wheelhouse_dir) + # delete the 'dist' directory since it causes permissions issues + shutil.rmtree(os.path.join(args.jax_path, "dist")) if __name__ == "__main__": From c62d4b40ad54a0e8072d564b5ff3a31f6722200c Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Tue, 1 Oct 2024 16:47:36 +0000 Subject: [PATCH 12/22] remove the egg-info dir --- build/rocm/tools/build_wheels.py | 1 + 1 file changed, 1 insertion(+) diff --git a/build/rocm/tools/build_wheels.py b/build/rocm/tools/build_wheels.py index 01a1025409f6..fb1e55b09504 100644 --- a/build/rocm/tools/build_wheels.py +++ b/build/rocm/tools/build_wheels.py @@ -256,6 +256,7 @@ def main(): shutil.copy(whl, wheelhouse_dir) # delete the 'dist' directory since it causes permissions issues shutil.rmtree(os.path.join(args.jax_path, "dist")) + shutil.rmtree(os.path.join(args.jax_path, "jax.egg-info")) if 
__name__ == "__main__": From e9689d54f71a9e6aea72948c8c78b79f04f16d99 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Tue, 1 Oct 2024 16:58:57 +0000 Subject: [PATCH 13/22] more cleanup --- build/rocm/tools/build_wheels.py | 1 + 1 file changed, 1 insertion(+) diff --git a/build/rocm/tools/build_wheels.py b/build/rocm/tools/build_wheels.py index fb1e55b09504..f29ca4e57231 100644 --- a/build/rocm/tools/build_wheels.py +++ b/build/rocm/tools/build_wheels.py @@ -257,6 +257,7 @@ def main(): # delete the 'dist' directory since it causes permissions issues shutil.rmtree(os.path.join(args.jax_path, "dist")) shutil.rmtree(os.path.join(args.jax_path, "jax.egg-info")) + shutil.rmtree(os.path.join(args.jax_path, "jax", "__pycache__")) if __name__ == "__main__": From 6a49a56d91b4bf4138e80206d0f0f0dfef9bf1cb Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Tue, 1 Oct 2024 17:19:03 +0000 Subject: [PATCH 14/22] remove cleanup stage --- .github/workflows/rocm-ci.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/rocm-ci.yaml b/.github/workflows/rocm-ci.yaml index 9ba9e1f54b41..eefc072889c5 100644 --- a/.github/workflows/rocm-ci.yaml +++ b/.github/workflows/rocm-ci.yaml @@ -16,9 +16,6 @@ jobs: run: | ./build/rocm/ci_build.sh --rocm_version 6.1.3 \ --keep_image --py_version 3.10 - - name: Cleanup - if: always() - run: rm -rf ${{ github.workspace }} # - name: Try rocm-smi From 8de5fadac22f632fb1216cb8643ede5d7b3713f7 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Tue, 1 Oct 2024 17:39:12 +0000 Subject: [PATCH 15/22] archive wheels --- .github/workflows/rocm-ci.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/rocm-ci.yaml b/.github/workflows/rocm-ci.yaml index eefc072889c5..a2814cf6bc46 100644 --- a/.github/workflows/rocm-ci.yaml +++ b/.github/workflows/rocm-ci.yaml @@ -16,6 +16,11 @@ jobs: run: | ./build/rocm/ci_build.sh --rocm_version 6.1.3 \ --keep_image --py_version 3.10 + - name: Archive jax wheels + uses: 
actions/upload-artifact@v2 + with: + name: rocm_jax_r6_1_3_py3_10_id${{ github.run_id }} + path: ${{ github.workspace }}/wheelhouse/*.whl # - name: Try rocm-smi From b7a984850f73a9556ba4201f2c75ee8b9de57106 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Tue, 1 Oct 2024 17:40:47 +0000 Subject: [PATCH 16/22] fix action version --- .github/workflows/rocm-ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rocm-ci.yaml b/.github/workflows/rocm-ci.yaml index a2814cf6bc46..5400daba43fa 100644 --- a/.github/workflows/rocm-ci.yaml +++ b/.github/workflows/rocm-ci.yaml @@ -17,7 +17,7 @@ jobs: ./build/rocm/ci_build.sh --rocm_version 6.1.3 \ --keep_image --py_version 3.10 - name: Archive jax wheels - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: rocm_jax_r6_1_3_py3_10_id${{ github.run_id }} path: ${{ github.workspace }}/wheelhouse/*.whl From 47521d09a004f62b4e1bf71fd55146d76e650fa9 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Tue, 1 Oct 2024 18:07:35 +0000 Subject: [PATCH 17/22] update permissions for wheels --- build/rocm/tools/build_wheels.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/build/rocm/tools/build_wheels.py b/build/rocm/tools/build_wheels.py index f29ca4e57231..b86d072cffb4 100644 --- a/build/rocm/tools/build_wheels.py +++ b/build/rocm/tools/build_wheels.py @@ -30,6 +30,7 @@ import subprocess import shutil import sys +import stat LOG = logging.getLogger(__name__) @@ -255,10 +256,22 @@ def main(): LOG.info("Copying %s into %s" % (whl, wheelhouse_dir)) shutil.copy(whl, wheelhouse_dir) # delete the 'dist' directory since it causes permissions issues + logging.info('Deleting dist, egg-info and cache directory') shutil.rmtree(os.path.join(args.jax_path, "dist")) shutil.rmtree(os.path.join(args.jax_path, "jax.egg-info")) shutil.rmtree(os.path.join(args.jax_path, "jax", "__pycache__")) + # make the wheels delete-abl by the runner + whl_house = os.join(args.jax_path, 
"wheelhouse") + logging.info(f'Changing permissions for {whl_house}') + mode = (stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | + stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP | + stat.S_IROTH | stat.S_IWOTH | stat.S_IXOTH ) + for item in os.listdir(whl_house): + whl_path = os.path.join(path, item) + if os.path.isfile(whl_path): + os.chmod(whl_path, mode) + if __name__ == "__main__": logging.basicConfig(level=logging.INFO) From f0125ad949d510bd95e5833008c51155902dde1c Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Tue, 1 Oct 2024 18:19:25 +0000 Subject: [PATCH 18/22] fix typo in script --- build/rocm/tools/build_wheels.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build/rocm/tools/build_wheels.py b/build/rocm/tools/build_wheels.py index b86d072cffb4..e60ee6c53dde 100644 --- a/build/rocm/tools/build_wheels.py +++ b/build/rocm/tools/build_wheels.py @@ -262,13 +262,13 @@ def main(): shutil.rmtree(os.path.join(args.jax_path, "jax", "__pycache__")) # make the wheels delete-abl by the runner - whl_house = os.join(args.jax_path, "wheelhouse") + whl_house = os.path.join(args.jax_path, "wheelhouse") logging.info(f'Changing permissions for {whl_house}') mode = (stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IWOTH | stat.S_IXOTH ) for item in os.listdir(whl_house): - whl_path = os.path.join(path, item) + whl_path = os.path.join(whl_house, item) if os.path.isfile(whl_path): os.chmod(whl_path, mode) From 4df858b78a35377a8d7472e5cad56183bbe70ec9 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Wed, 2 Oct 2024 02:37:34 +0000 Subject: [PATCH 19/22] override image name --- .github/workflows/rocm-ci.yaml | 20 ++++++++++---------- build/rocm/ci_build.sh | 4 +++- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/.github/workflows/rocm-ci.yaml b/.github/workflows/rocm-ci.yaml index 5400daba43fa..c913dd8d967f 100644 --- a/.github/workflows/rocm-ci.yaml +++ 
b/.github/workflows/rocm-ci.yaml @@ -11,7 +11,7 @@ jobs: - uses: actions/checkout@v4 - name: Build Docker env: - BUILD_TAG: rocm_jax_r6_1_3_py3_10_id${{ github.run_id }} + DOCKER_IMG_NAME: rocm_jax_r6_1_3_py3_10_id${{ github.run_id }} # XLA_CLONE_DIR: run: | ./build/rocm/ci_build.sh --rocm_version 6.1.3 \ @@ -23,12 +23,12 @@ jobs: path: ${{ github.workspace }}/wheelhouse/*.whl -# - name: Try rocm-smi -# run: | -# docker run \ -# --group-add video \ -# $RENDER_DEVICES \ -# --cap-add=SYS_PTRACE \ -# --security-opt seccomp=unconfined \ -# --shm-size=64G \ -# rocm/rocm-terminal rocm-smi \ No newline at end of file + - name: Detect GPUs + run: | + docker run \ + --group-add video \ + $RENDER_DEVICES \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --shm-size=64G \ + rocm/rocm-terminal rocm-smi diff --git a/build/rocm/ci_build.sh b/build/rocm/ci_build.sh index 302a0449b19e..4cd150720c2e 100755 --- a/build/rocm/ci_build.sh +++ b/build/rocm/ci_build.sh @@ -117,8 +117,10 @@ WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}" BUILD_TAG="${BUILD_TAG:-jax}" # Determine the docker image name and BUILD_TAG. -DOCKER_IMG_NAME="${BUILD_TAG}.${CONTAINER_TYPE}" +DOCKER_IMG_NAME_DEFAULT="${BUILD_TAG}.${CONTAINER_TYPE}" +# Let the env override the image name +DOCKER_IMG_NAME="${DOCKER_IMG_NAME:-$DOCKER_IMG_NAME_DEFAULT}" # Under Jenkins matrix build, the build tag may contain characters such as # commas (,) and equal signs (=), which are not valid inside docker image names. 
DOCKER_IMG_NAME=$(echo "${DOCKER_IMG_NAME}" | sed -e 's/=/_/g' -e 's/,/-/g') From 23dcab16b088aaf01b6c5bf9cc0d234f8c303ed2 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Wed, 2 Oct 2024 02:45:13 +0000 Subject: [PATCH 20/22] add test stage --- .github/workflows/rocm-ci.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rocm-ci.yaml b/.github/workflows/rocm-ci.yaml index c913dd8d967f..4799c30ba4e2 100644 --- a/.github/workflows/rocm-ci.yaml +++ b/.github/workflows/rocm-ci.yaml @@ -21,8 +21,6 @@ jobs: with: name: rocm_jax_r6_1_3_py3_10_id${{ github.run_id }} path: ${{ github.workspace }}/wheelhouse/*.whl - - - name: Detect GPUs run: | docker run \ @@ -32,3 +30,9 @@ jobs: --security-opt seccomp=unconfined \ --shm-size=64G \ rocm/rocm-terminal rocm-smi + - name: Run tests + env: + DOCKER_IMG_NAME: rocm_jax_r6_1_3_py3_10_id${{ github.run_id }} + run: | + ./build/rocm/ci_build test ${DOCKER_IMG_NAME} + From 8a6a2c93dad3d9f1f3b326a67f6c71b657903131 Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Wed, 2 Oct 2024 15:20:40 +0000 Subject: [PATCH 21/22] ensure stdin is a tty --- build/rocm/ci_build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/rocm/ci_build b/build/rocm/ci_build index 3c051be8c027..9a2e9b19ec65 100755 --- a/build/rocm/ci_build +++ b/build/rocm/ci_build @@ -99,7 +99,7 @@ def dist_wheels( cmd.extend(mounts) - if os.isatty(sys.stdout.fileno()): + if os.isatty(sys.stdout.fileno()) and os.isatty(sys.stdin.fileno()): cmd.append("-it") # NOTE(mrodden): bazel times out without --init, probably blocking on a zombie PID From 6a0d1a63805353d8f5881c7db71350f76d5e4c9b Mon Sep 17 00:00:00 2001 From: Jehandad Khan Date: Wed, 2 Oct 2024 15:41:36 +0000 Subject: [PATCH 22/22] guard -it for docker with check --- build/rocm/ci_build | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/build/rocm/ci_build b/build/rocm/ci_build index 9a2e9b19ec65..3519122dfbed 100755 --- 
a/build/rocm/ci_build +++ b/build/rocm/ci_build @@ -99,7 +99,7 @@ def dist_wheels( cmd.extend(mounts) - if os.isatty(sys.stdout.fileno()) and os.isatty(sys.stdin.fileno()): + if os.isatty(sys.stdout.fileno()): cmd.append("-it") # NOTE(mrodden): bazel times out without --init, probably blocking on a zombie PID @@ -208,10 +208,12 @@ def test(image_name): cmd = [ "docker", "run", - "-it", "--rm", ] + if os.isatty(sys.stdout.fileno()): + cmd.append("-it") + # NOTE(mrodden): we need jax source dir for the unit test code only, # JAX and jaxlib are already installed from wheels # docker run fails when mounting ./