Skip to content

Commit

Permalink
Rocm skip for multigpu fail on lower asic
Browse files Browse the repository at this point in the history
Add fork maintenance system and add skip for multigpu issue
  • Loading branch information
Cemberk committed Nov 20, 2024
1 parent ae1580f commit bb77cb1
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 0 deletions.
30 changes: 30 additions & 0 deletions .github/workflows/fork-maintenance.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
name: Run Scheduled Events Docker

# Permissions the reusable workflow needs to push branches and open PRs/issues.
permissions:
  actions: write
  contents: write
  issues: write
  pull-requests: write

# NOTE: `on` is a YAML 1.1 boolean-looking key; GitHub's loader handles it.
on:  # yamllint disable-line rule:truthy
  workflow_dispatch:
  schedule:
    # At 00:00 UTC on day-of-month 10 (i.e. monthly), not every 10 minutes.
    - cron: '0 0 10 * *'

jobs:
  run-scheduled-events:
    uses: Cemberk/fork-maintenance-system/.github/workflows/fork-maintenance-action.yml@artifacts
    with:
      platform: 'gfx90a'
      upstream_repo: 'https://github.com/huggingface/transformers'
      pr_branch_prefix: 'scheduled-merge'
      # Folded block scalars (>-) below: lines are joined with single spaces into
      # one shell command line, so the embedded quotes need no YAML escaping
      # (the original single-quoted form leaked literal backslashes into the shell).
      requirements_command: >-
        sudo sed -i 's/torchaudio//g' examples/pytorch/_tests_requirements.txt &&
        pip install -r examples/pytorch/_tests_requirements.txt &&
        git restore examples/pytorch/_tests_requirements.txt &&
        pip install --no-cache-dir GPUtil azureml azureml-core tokenizers ninja
        cerberus sympy sacremoses sacrebleu==1.5.1 sentencepiece scipy
        scikit-learn urllib3 &&
        pip install huggingface_hub datasets &&
        pip install parameterized &&
        pip install -e .
      # Select the first five test folders (model tests first), run each with
      # pytest, then dump every generated stats report. The original plain
      # scalar was unparseable YAML (unquoted `: `, stray backslashes and a
      # mismatched quote around the `python3 -c` snippet).
      unit_test_command: >-
        cd tests;
        folders=$(python3 -c 'import os; tests = os.getcwd(); models = "models";
        model_tests = os.listdir(os.path.join(tests, models));
        d1 = sorted(list(filter(os.path.isdir, os.listdir(tests))));
        d2 = sorted(list(filter(os.path.isdir, [os.path.join(models, x) for x in model_tests])));
        d1.remove(models); d = d2 + d1; print(" ".join(d[:5]))');
        cd ..;
        for folder in ${folders[@]}; do
        pytest tests/${folder} -v
        --make-reports=huggingface_unit_tests_${machine_type}_run_models_gpu_${folder}
        -rfEs --continue-on-collection-errors -m "not not_device_test"
        -p no:cacheprovider;
        done;
        allstats=$(find reports -name stats.txt);
        for stat in ${allstats[@]}; do echo $stat; cat $stat; done
      # Echo-only placeholder: prints the benchmark command instead of running it.
      performance_test_command: >-
        echo "python examples/pytorch/language-modeling/run_mlm.py
        --model_name_or_path bert-base-uncased --dataset_name wikitext
        --dataset_config_name wikitext-2-raw-v1 --do_train --do_eval
        --output_dir /tmp/test-mlm --per_device_train_batch_size 8
        --per_device_eval_batch_size 8 --max_steps 500"
      docker_image: 'rocm/pytorch:latest'
      # ROCm device access (/dev/kfd, /dev/dri) plus large shared memory for DataLoader workers.
      docker_options: '--device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --network=host'
    secrets:
      GIT_TOKEN: ${{ secrets.CRED_TOKEN }}
      schedule_json: ${{ secrets.SCHEDULE_CONFIG }}


2 changes: 2 additions & 0 deletions tests/test_modeling_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2818,6 +2818,8 @@ def test_inputs_embeds_matches_input_ids(self):
)[0]
self.assertTrue(torch.allclose(out_embeds, out_ids))


@skipIfRocm
@require_torch_multi_gpu
def test_multi_gpu_data_parallel_forward(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
Expand Down

0 comments on commit bb77cb1

Please sign in to comment.