[AMD] Add initial version for run_tests_multi_gpu (#26346)
* Add initial version for run_tests_multi_gpu
* Trigger change in BERT
* fix typo setup -> setup_gpu
* Add tag mi210
* Enable multi-gpu jobs
* One more
* Use dynamic device allocation
* Attempt to fix syntax for docker create
* fix script path
* fix
* temp machine type
* fix label
* Enable multi-gpu tests
* Rename multi-amd-gpu to multi-gpu
* Let's not be lazy dude
* Update rocm-smi output
* Add gpu_flavour in the matrix
* Fix typos
* merge single/multi dispatch into the matrix
* Format.
* Revert BERT's change

---------

Co-authored-by: Guillaume LEGENDRE <glegendre01@gmail.com>
This commit is contained in:
41
.github/workflows/self-push-amd.yml
vendored
41
.github/workflows/self-push-amd.yml
vendored
@@ -44,28 +44,32 @@ jobs:
|
|||||||
needs: check_runner_status
|
needs: check_runner_status
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [single-gpu]
|
machine_type: [single-gpu, multi-gpu]
|
||||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
|
gpu_flavor: [mi210]
|
||||||
|
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
|
||||||
container:
|
container:
|
||||||
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
|
|
||||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||||
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
steps:
|
steps:
|
||||||
- name: ROCM-SMI
|
- name: ROCM-SMI
|
||||||
run: |
|
run: |
|
||||||
rocm-smi
|
rocminfo | grep "Agent" -A 14
|
||||||
|
- name: Show HIP environment
|
||||||
|
run: |
|
||||||
|
echo "HIP: $HIP_VISIBLE_DEVICES"
|
||||||
|
echo "ROCR: $ROCR_VISIBLE_DEVICES"
|
||||||
|
|
||||||
setup_gpu:
|
setup_gpu:
|
||||||
name: Setup
|
name: Setup
|
||||||
needs: check_runners
|
needs: check_runners
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [single-gpu]
|
machine_type: [single-gpu, multi-gpu]
|
||||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
|
gpu_flavor: [mi210]
|
||||||
|
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
|
||||||
container:
|
container:
|
||||||
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
|
|
||||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||||
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
outputs:
|
outputs:
|
||||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||||
test_map: ${{ steps.set-matrix.outputs.test_map }}
|
test_map: ${{ steps.set-matrix.outputs.test_map }}
|
||||||
@@ -150,7 +154,7 @@ jobs:
|
|||||||
echo "matrix=$keys" >> $GITHUB_OUTPUT
|
echo "matrix=$keys" >> $GITHUB_OUTPUT
|
||||||
echo "test_map=$test_map" >> $GITHUB_OUTPUT
|
echo "test_map=$test_map" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
run_tests_single_gpu:
|
run_tests_amdgpu:
|
||||||
name: Model tests
|
name: Model tests
|
||||||
needs: setup_gpu
|
needs: setup_gpu
|
||||||
# `dummy` means there is no test to run
|
# `dummy` means there is no test to run
|
||||||
@@ -159,12 +163,12 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
|
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
|
||||||
machine_type: [single-gpu]
|
machine_type: [single-gpu, multi-gpu]
|
||||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
|
gpu_flavor: [mi210]
|
||||||
|
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
|
||||||
container:
|
container:
|
||||||
# --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
|
|
||||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||||
options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
steps:
|
steps:
|
||||||
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
||||||
# We also take into account the `push` event (we might want to test some changes in a branch)
|
# We also take into account the `push` event (we might want to test some changes in a branch)
|
||||||
@@ -216,7 +220,11 @@ jobs:
|
|||||||
|
|
||||||
- name: ROCM-SMI
|
- name: ROCM-SMI
|
||||||
run: |
|
run: |
|
||||||
rocm-smi
|
rocminfo | grep "Agent" -A 14
|
||||||
|
- name: Show HIP environment
|
||||||
|
run: |
|
||||||
|
echo "HIP: $HIP_VISIBLE_DEVICES"
|
||||||
|
echo "ROCR: $ROCR_VISIBLE_DEVICES"
|
||||||
|
|
||||||
- name: Environment
|
- name: Environment
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
@@ -252,8 +260,7 @@ jobs:
|
|||||||
check_runner_status,
|
check_runner_status,
|
||||||
check_runners,
|
check_runners,
|
||||||
setup_gpu,
|
setup_gpu,
|
||||||
run_tests_single_gpu,
|
run_tests_amdgpu,
|
||||||
# run_tests_multi_gpu,
|
|
||||||
# run_tests_torch_cuda_extensions_single_gpu,
|
# run_tests_torch_cuda_extensions_single_gpu,
|
||||||
# run_tests_torch_cuda_extensions_multi_gpu
|
# run_tests_torch_cuda_extensions_multi_gpu
|
||||||
]
|
]
|
||||||
|
|||||||
Reference in New Issue
Block a user