CI: update to ROCm 6.0.2 and test MI300 (#30266)

* update to ROCm 6.0.2 and test MI300

* add callers for mi300

* update dockerfile

* fix trainer tests

* remove apex

* style

* Update tests/trainer/test_trainer_seq2seq.py

* Update tests/trainer/test_trainer_seq2seq.py

* Update tests/trainer/test_trainer_seq2seq.py

* Update tests/trainer/test_trainer_seq2seq.py

* update to torch 2.3

* add workflow dispatch target

* we may need branches: mi300-ci after all

* nit

* fix docker build

* nit

* add check runner

* remove docker-gpu

* fix issues

* fix

---------

Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
fxmarty
2024-05-13 18:14:36 +02:00
committed by GitHub
parent 539ed75d50
commit 37bba2a32d
14 changed files with 170 additions and 81 deletions

View File

@@ -0,0 +1,25 @@
name: Self-hosted runner (AMD mi300 CI caller)
on:
workflow_run:
workflows: ["Self-hosted runner (push-caller)"]
branches: ["main"]
types: [completed]
push:
branches:
- run_amd_push_ci_caller*
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
- "utils/**"
jobs:
run_amd_ci:
name: AMD mi300
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci'))))
uses: ./.github/workflows/self-push-amd.yml
with:
gpu_flavor: mi300
secrets: inherit

View File

@@ -36,7 +36,7 @@ jobs:
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -57,7 +57,7 @@ jobs:
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -155,7 +155,7 @@ jobs:
matrix:
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -230,7 +230,7 @@ jobs:
- name: Run all non-slow selected tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}
python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}

View File

@@ -16,4 +16,5 @@ jobs:
uses: ./.github/workflows/self-scheduled-amd.yml
with:
gpu_flavor: mi210
slack_report_channel: "#transformers-ci-daily-amd"
secrets: inherit

View File

@@ -16,4 +16,5 @@ jobs:
uses: ./.github/workflows/self-scheduled-amd.yml
with:
gpu_flavor: mi250
slack_report_channel: "#transformers-ci-daily-amd"
secrets: inherit

View File

@@ -0,0 +1,21 @@
name: Self-hosted runner (AMD mi300 scheduled CI caller)
on:
workflow_run:
workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
branches: ["main"]
types: [completed]
push:
branches:
- run_amd_scheduled_ci_caller*
jobs:
run_amd_ci:
name: AMD mi300
needs: build-docker-containers
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci'))))
uses: ./.github/workflows/self-scheduled-amd.yml
with:
gpu_flavor: mi300
slack_report_channel: "#transformers-ci-daily-amd"
secrets: inherit

View File

@@ -34,7 +34,7 @@ jobs:
fetch-depth: 2
- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1,hf-amd-mi300-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
check_runners:
name: Check Runners
@@ -42,7 +42,7 @@ jobs:
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -63,7 +63,7 @@ jobs:
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -116,7 +116,7 @@ jobs:
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -162,7 +162,7 @@ jobs:
- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
@@ -184,7 +184,7 @@ jobs:
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machine_type: [multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -230,7 +230,7 @@ jobs:
- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
@@ -250,7 +250,7 @@ jobs:
fail-fast: false
matrix:
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -287,7 +287,7 @@ jobs:
working-directory: /transformers
run: |
pip install -r examples/pytorch/_tests_requirements.txt
python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch
python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
@@ -307,7 +307,7 @@ jobs:
fail-fast: false
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu
options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -343,7 +343,7 @@ jobs:
- name: Run all pipeline tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
@@ -364,7 +364,7 @@ jobs:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
needs: setup
container:
image: huggingface/transformers-pytorch-deepspeed-amd-gpu
@@ -400,7 +400,7 @@ jobs:
- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}