CI: update to ROCm 6.0.2 and test MI300 (#30266)
* update to ROCm 6.0.2 and test MI300 * add callers for mi300 * update dockerfile * fix trainer tests * remove apex * style * Update tests/trainer/test_trainer_seq2seq.py * Update tests/trainer/test_trainer_seq2seq.py * Update tests/trainer/test_trainer_seq2seq.py * Update tests/trainer/test_trainer_seq2seq.py * update to torch 2.3 * add workflow dispatch target * we may need branches: mi300-ci after all * nit * fix docker build * nit * add check runner * remove docker-gpu * fix issues * fix --------- Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
8
.github/workflows/self-push-amd.yml
vendored
8
.github/workflows/self-push-amd.yml
vendored
@@ -36,7 +36,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
|
||||
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
@@ -57,7 +57,7 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
|
||||
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
@@ -155,7 +155,7 @@ jobs:
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
|
||||
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
@@ -230,7 +230,7 @@ jobs:
|
||||
- name: Run all non-slow selected tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}
|
||||
python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
|
||||
Reference in New Issue
Block a user