[AMD] Add initial version for run_tests_multi_gpu (#26346)

* Add initial version for run_tests_multi_gpu * Trigger change in BERT * fix typo setup -> setup_gpu * Add tag mi210 * Enable multi-gpu jobs * One more * Use dynamic device allocation * Attempt to fix syntax for docker create * fix script path * fix * temp machine type * fix label * Enable multi-gpu tests * Rename multi-amd-gpu to multi-gpu * Let's not be lazy dude * Update rocm-smi output * Add gpu_flavour in the matrix * Fix typos * merge single/multi dispatch into the matrix * Format. * Revert BERT's change --------- Co-authored-by: Guillaume LEGENDRE <glegendre01@gmail.com>
2023-10-03 11:13:45 +02:00
parent 768aa3d9cd
commit 3632fb3c25
1 changed files with 24 additions and 17 deletions
--- a/.github/workflows/self-push-amd.yml
+++ b/.github/workflows/self-push-amd.yml
@@ -44,28 +44,32 @@ jobs:
    needs: check_runner_status
    strategy:
      matrix:
-        machine_type: [single-gpu]
+        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
+        gpu_flavor: [mi210]
    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
    container:
      # --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
      image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
-      options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: ROCM-SMI
        run: |
-          rocm-smi
+          rocminfo  | grep "Agent" -A 14
      - name: Show HIP environment
        run: |
          echo "HIP: $HIP_VISIBLE_DEVICES"
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
  setup_gpu:
    name: Setup
    needs: check_runners
    strategy:
      matrix:
-        machine_type: [single-gpu]
+        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
+        gpu_flavor: [mi210]
    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
    container:
      # --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
      image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
-      options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      test_map: ${{ steps.set-matrix.outputs.test_map }}
@@ -150,7 +154,7 @@ jobs:
          echo "matrix=$keys" >> $GITHUB_OUTPUT
          echo "test_map=$test_map" >> $GITHUB_OUTPUT
-  run_tests_single_gpu:
+  run_tests_amdgpu:
    name: Model tests
    needs: setup_gpu
    # `dummy` means there is no test to run
@@ -159,12 +163,12 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
-        machine_type: [single-gpu]
+        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', mi210]
+        gpu_flavor: [mi210]
    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
    container:
      # --device /dev/dri/renderD128 == AMDGPU:0 (indexing for AMDGPU starts at 128 ...)
      image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
-      options: --device /dev/kfd --device /dev/dri/renderD128 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
      # We also take into account the `push` event (we might want to test some changes in a branch)
@@ -216,7 +220,11 @@ jobs:
      - name: ROCM-SMI
        run: |
-          rocm-smi
+          rocminfo  | grep "Agent" -A 14
      - name: Show HIP environment
        run: |
          echo "HIP: $HIP_VISIBLE_DEVICES"
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
      - name: Environment
        working-directory: /transformers
@@ -252,8 +260,7 @@ jobs:
        check_runner_status,
        check_runners,
        setup_gpu,
-        run_tests_single_gpu,
+        run_tests_amdgpu,
 #        run_tests_multi_gpu,
 #        run_tests_torch_cuda_extensions_single_gpu,
 #        run_tests_torch_cuda_extensions_multi_gpu
    ]