From d903abfccc0f0fc8e73364cf5418a26118bda99e Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 17 Nov 2023 10:44:37 +0100 Subject: [PATCH] Fix AMD CI not showing GPU (#27555) fix Co-authored-by: ydshieh --- .github/workflows/self-push-amd.yml | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml index c72f224a30..19857981b1 100644 --- a/.github/workflows/self-push-amd.yml +++ b/.github/workflows/self-push-amd.yml @@ -38,14 +38,16 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now - options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - name: ROCM-SMI run: | - rocminfo | grep "Agent" -A 14 - - name: Show HIP environment + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment run: | - echo "HIP: $HIP_VISIBLE_DEVICES" echo "ROCR: $ROCR_VISIBLE_DEVICES" setup_gpu: @@ -57,7 +59,7 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now - options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} test_map: ${{ steps.set-matrix.outputs.test_map }} @@ -155,7 +157,7 @@ jobs: runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now - options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: # Necessary to get the correct branch name and commit SHA for `workflow_run` event # We also take into account the `push` event (we might want to test some changes in a branch) @@ -207,10 +209,12 @@ jobs: - name: ROCM-SMI run: | - rocminfo | grep "Agent" -A 14 - - name: Show HIP environment + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment run: | - echo "HIP: $HIP_VISIBLE_DEVICES" echo "ROCR: $ROCR_VISIBLE_DEVICES" - name: Environment