Use AMD CI workflow defined in hf-workflows (#35058)

* Use AMD CI workflow defined in hf-workflows
2025-01-17 20:52:57 +01:00
parent 7d4b3ddde4
commit 5fa3534475
4 changed files with 113 additions and 462 deletions
--- a/.github/workflows/self-scheduled-amd-mi210-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml
@@ -1,55 +1,55 @@
-name: Self-hosted runner (AMD mi210 scheduled CI caller)
+name: Self-hosted runner (AMD mi210 scheduled CI caller)
-
+
-on:
+on:
-  workflow_run:
+  workflow_run:
-    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
+    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
-    branches: ["main"]
+    branches: ["main"]
-    types: [completed]
+    types: [completed]
-  push:
+  push:
-    branches:
+    branches:
-      - run_amd_scheduled_ci_caller*
+      - run_amd_scheduled_ci_caller*
-
+
-jobs:
+jobs:
-  model-ci:
+  model-ci:
-    name: Model CI
+    name: Model CI
-    uses: ./.github/workflows/self-scheduled-amd.yml
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
+    with:
-      job: run_models_gpu
+      job: run_models_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
+      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi210
+      runner: mi210
-      docker: huggingface/transformers-pytorch-amd-gpu
+      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi210
+      ci_event: Scheduled CI (AMD) - mi210
-    secrets: inherit
+    secrets: inherit
-
+
-  torch-pipeline:
+  torch-pipeline:
-    name: Torch pipeline CI
+    name: Torch pipeline CI
-    uses: ./.github/workflows/self-scheduled-amd.yml
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
+    with:
-      job: run_pipelines_torch_gpu
+      job: run_pipelines_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
+      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi210
+      runner: mi210
-      docker: huggingface/transformers-pytorch-amd-gpu
+      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi210
+      ci_event: Scheduled CI (AMD) - mi210
-    secrets: inherit
+    secrets: inherit
-
+
-  example-ci:
+  example-ci:
-    name: Example CI
+    name: Example CI
-    uses: ./.github/workflows/self-scheduled-amd.yml
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
+    with:
-      job: run_examples_gpu
+      job: run_examples_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
+      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi210
+      runner: mi210
-      docker: huggingface/transformers-pytorch-amd-gpu
+      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi210
+      ci_event: Scheduled CI (AMD) - mi210
-    secrets: inherit
+    secrets: inherit
-
+
-  deepspeed-ci:
+  deepspeed-ci:
-    name: DeepSpeed CI
+    name: DeepSpeed CI
-    uses: ./.github/workflows/self-scheduled-amd.yml
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
+    with:
-      job: run_torch_cuda_extensions_gpu
+      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
+      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi210
+      runner: mi210
-      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
+      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi210
+      ci_event: Scheduled CI (AMD) - mi210
-    secrets: inherit
+    secrets: inherit
--- a/.github/workflows/self-scheduled-amd-mi250-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml
@@ -1,55 +1,55 @@
-name: Self-hosted runner (AMD mi250 scheduled CI caller)
+name: Self-hosted runner (AMD mi250 scheduled CI caller)
-
+
-on:
+on:
-  workflow_run:
+  workflow_run:
-    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
+    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
-    branches: ["main"]
+    branches: ["main"]
-    types: [completed]
+    types: [completed]
-  push:
+  push:
-    branches:
+    branches:
-      - run_amd_scheduled_ci_caller*
+      - run_amd_scheduled_ci_caller*
-
+
-jobs:
+jobs:
-  model-ci:
+  model-ci:
-    name: Model CI
+    name: Model CI
-    uses: ./.github/workflows/self-scheduled-amd.yml
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
+    with:
-      job: run_models_gpu
+      job: run_models_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
+      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi250
+      runner: mi250
-      docker: huggingface/transformers-pytorch-amd-gpu
+      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi250
+      ci_event: Scheduled CI (AMD) - mi250
-    secrets: inherit
+    secrets: inherit
-
+
-  torch-pipeline:
+  torch-pipeline:
-    name: Torch pipeline CI
+    name: Torch pipeline CI
-    uses: ./.github/workflows/self-scheduled-amd.yml
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
+    with:
-      job: run_pipelines_torch_gpu
+      job: run_pipelines_torch_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
+      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi250
+      runner: mi250
-      docker: huggingface/transformers-pytorch-amd-gpu
+      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi250
+      ci_event: Scheduled CI (AMD) - mi250
-    secrets: inherit
+    secrets: inherit
-
+
-  example-ci:
+  example-ci:
-    name: Example CI
+    name: Example CI
-    uses: ./.github/workflows/self-scheduled-amd.yml
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
+    with:
-      job: run_examples_gpu
+      job: run_examples_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
+      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi250
+      runner: mi250
-      docker: huggingface/transformers-pytorch-amd-gpu
+      docker: huggingface/transformers-pytorch-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi250
+      ci_event: Scheduled CI (AMD) - mi250
-    secrets: inherit
+    secrets: inherit
-
+
-  deepspeed-ci:
+  deepspeed-ci:
-    name: DeepSpeed CI
+    name: DeepSpeed CI
-    uses: ./.github/workflows/self-scheduled-amd.yml
+    uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
-    with:
+    with:
-      job: run_torch_cuda_extensions_gpu
+      job: run_torch_cuda_extensions_gpu
-      slack_report_channel: "#transformers-ci-daily-amd"
+      slack_report_channel: "#transformers-ci-daily-amd"
-      runner: mi250
+      runner: mi250
-      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
+      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
-      ci_event: Scheduled CI (AMD) - mi250
+      ci_event: Scheduled CI (AMD) - mi250
-    secrets: inherit
+    secrets: inherit
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -1,349 +0,0 @@
 name: Self-hosted runner (scheduled-amd)
 # Note: For the AMD CI, we rely on a caller workflow and on the workflow_call event to trigger the
 # CI in order to run it on both MI210 and MI250, without having to use matrix here which pushes
 # us towards the limit of allowed jobs on GitHub Actions.
 # Reusable-workflow entry point: this file only runs when another workflow calls it
 # through `workflow_call`. All five inputs are required strings.
 on:
  workflow_call:
    inputs:
      # Selects which test job runs; each test job's `if:` compares its own name to this value.
      job:
        required: true
        type: string
      # Forwarded unchanged to slack-report.yml by the `send_results` job.
      slack_report_channel:
        required: true
        type: string
      # Used as an extra `runs-on` label to pick the self-hosted AMD runner type.
      runner:
        required: true
        type: string
      # Container image for the test jobs (also passed on to model_jobs_amd.yml).
      docker:
        required: true
        type: string
      # Forwarded unchanged to slack-report.yml by the `send_results` job.
      ci_event:
        required: true
        type: string
 # Workflow-level environment shared by every job in this file.
 # Scalar values are quoted so that YAML 1.1 parsers and linters do not retype `yes` as a
 # boolean or the counts as integers; the jobs still receive the same plain strings.
 env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: "yes"
  OMP_NUM_THREADS: "8"
  MKL_NUM_THREADS: "8"
  RUN_SLOW: "yes"
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  # Number of slices the `setup` job passes to split_model_tests.py and uses for `slice_ids`.
  NUM_SLICES: "2"
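 # Important note: each call of this workflow runs at most one test job (run_models_gpu,
 # run_pipelines_torch_gpu, run_examples_gpu or run_torch_cuda_extensions_gpu), selected by
 # `inputs.job`; the other test jobs are skipped by their `if:` conditions. The test jobs depend
 # only on `check_runners` (or `setup`), not on each other.
 # The intent is to avoid parallelizing the scheduled tests, to leave available
 # runners for the push CI that is running on the same machine.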
 jobs:
  # Gate job: runs on a GitHub-hosted machine and calls utils/check_self_hosted_runner.py for
  # the listed AMD runners. Every other job in this file depends on it, directly or indirectly.
  check_runner_status:
    name: Check Runner Status
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout transformers
        uses: actions/checkout@v4
        with:
          fetch-depth: 2
      # NOTE(review): all three runner types are checked regardless of `inputs.runner` —
      # confirm that this is intended.
      - name: Check Runner Status
        # The token is handed over through the environment instead of being interpolated into
        # the script text, so the secret never becomes part of the generated shell script.
        env:
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
        run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1,hf-amd-mi300-ci-1gpu-1 --token "$ACCESS_REPO_INFO_TOKEN"
  # Smoke check of the self-hosted AMD runners: starts the container on a single-gpu and on a
  # multi-gpu machine of the requested runner type and prints ROCm device information.
  check_runners:
    name: Check Runners
    needs: check_runner_status
    runs-on:
      - "${{ matrix.machine_type }}"
      - self-hosted
      - amd-gpu
      - "${{ inputs.runner }}"
    strategy:
      matrix:
        machine_type:
          - single-gpu
          - multi-gpu
    container:
      # Fixed image: this job does not use `inputs.docker`.
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
  # Runs only when `inputs.job` is `run_models_gpu`. Computes how the model tests are split
  # into slices and publishes the result as job outputs (`folder_slices`, `slice_ids`).
  setup:
    name: Setup
    if: contains(fromJSON('["run_models_gpu"]'), inputs.job)
    needs: check_runners
    runs-on:
      - "${{ matrix.machine_type }}"
      - self-hosted
      - amd-gpu
      - "${{ inputs.runner }}"
    strategy:
      matrix:
        machine_type:
          - single-gpu
          - multi-gpu
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
    steps:
      # Bring the /transformers checkout to the commit that triggered this run.
      - name: Update clone
        working-directory: /transformers
        run: |
          git fetch && git checkout ${{ github.sha }}
      # Remove bytecode caches and reports left over in the checkout.
      - name: Cleanup
        working-directory: /transformers
        run: |
          rm -rf tests/__pycache__
          rm -rf tests/models/__pycache__
          rm -rf reports
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze
      # Writes both job outputs: the folder slices printed by split_model_tests.py and the
      # list of slice indices 0..NUM_SLICES-1.
      - name: Identify models to test
        id: set-matrix
        working-directory: /transformers/tests
        run: |
          echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
          echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py
  # Model tests: delegates to model_jobs_amd.yml once per (machine_type, slice_id) pair,
  # using the slices computed by `setup`.
  # NOTE(review): the display name says "Single GPU tests" although the matrix also contains
  # `multi-gpu` — confirm whether the name should be updated.
  run_models_gpu:
    name: Single GPU tests
    if: inputs.job == 'run_models_gpu'
    needs: setup
    strategy:
      # For now, not to parallelize. Can change later if it works well.
      max-parallel: 1
      fail-fast: false
      matrix:
        machine_type:
          - single-gpu
          - multi-gpu
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs_amd.yml
    secrets: inherit
    with:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
      runner: ${{ inputs.runner }}
      docker: ${{ inputs.docker }}
  # Pipeline tests: runs pytest on tests/pipelines inside the `inputs.docker` image on both
  # machine types and uploads the report folder as an artifact.
  run_pipelines_torch_gpu:
    name: PyTorch pipelines
    if: inputs.job == 'run_pipelines_torch_gpu'
    needs: check_runners
    runs-on:
      - "${{ matrix.machine_type }}"
      - self-hosted
      - amd-gpu
      - "${{ inputs.runner }}"
    strategy:
      fail-fast: false
      matrix:
        machine_type:
          - single-gpu
          - multi-gpu
    container:
      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      # Bring the /transformers checkout to the commit that triggered this run.
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}
      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze
      - name: Run all pipeline tests on GPU
        working-directory: /transformers
        run: |
          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"
      # Best effort: printing the short failure list must not itself fail the job.
      - name: Failure short reports
        if: failure()
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
      # Uploaded even after failures so the reports stay available to later jobs.
      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports"
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
  # Example tests: installs the example test requirements and runs pytest on examples/pytorch
  # inside the `inputs.docker` image. Only the single-gpu machine type is used.
  run_examples_gpu:
    name: Examples directory
    if: inputs.job == 'run_examples_gpu'
    needs: check_runners
    runs-on:
      - "${{ matrix.machine_type }}"
      - self-hosted
      - amd-gpu
      - "${{ inputs.runner }}"
    strategy:
      fail-fast: false
      matrix:
        machine_type:
          - single-gpu
    container:
      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      # Bring the /transformers checkout to the commit that triggered this run.
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}
      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze
      - name: Run examples tests on GPU
        working-directory: /transformers
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
      # Best effort: printing the short failure list must not itself fail the job.
      - name: Failure short reports
        if: failure()
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
      # Uploaded even after failures so the reports stay available to later jobs.
      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu_test_reports"
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.machine_type }}_run_examples_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports
  # DeepSpeed / extended tests: runs pytest on tests/deepspeed and tests/extended inside the
  # `inputs.docker` image on both machine types and uploads the report folder as an artifact.
  run_torch_cuda_extensions_gpu:
    name: Torch ROCm deepspeed tests
    if: inputs.job == 'run_torch_cuda_extensions_gpu'
    needs: check_runners
    runs-on:
      - "${{ matrix.machine_type }}"
      - self-hosted
      - amd-gpu
      - "${{ inputs.runner }}"
    strategy:
      fail-fast: false
      matrix:
        machine_type:
          - single-gpu
          - multi-gpu
    container:
      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      # Bring the /transformers checkout to the commit that triggered this run.
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}
      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
      - name: ROCM-SMI
        run: |
          rocm-smi
      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py
      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze
      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended -m "not not_device_test"
      # Best effort: printing the short failure list must not itself fail the job.
      - name: Failure short reports
        if: failure()
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
      # Uploaded even after failures so the reports stay available to later jobs.
      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
  # Reporting job: waits for every other job, runs even when they failed or were skipped,
  # and hands the results over to the reusable slack-report.yml workflow.
  send_results:
    name: Slack Report
    if: always()
    needs:
      - check_runner_status
      - check_runners
      - setup
      - run_models_gpu
      - run_pipelines_torch_gpu
      - run_examples_gpu
      - run_torch_cuda_extensions_gpu
    uses: ./.github/workflows/slack-report.yml
    secrets: inherit
    with:
      job: ${{ inputs.job }}
      slack_report_channel: ${{ inputs.slack_report_channel }}
      ci_event: ${{ inputs.ci_event }}
      # This would be `skipped` if `setup` is skipped.
      setup_status: ${{ needs.setup.result }}
      # This would be an empty string if `setup` is skipped.
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      # NOTE(review): `setup` in this file declares only `folder_slices` and `slice_ids`, so
      # this expression presumably always evaluates to an empty string — confirm that
      # slack-report.yml accepts that.
      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
--- a/.github/workflows/slack-report.yml
+++ b/.github/workflows/slack-report.yml
@@ -70,7 +70,7 @@ jobs:
        with:
          name: ci_results_${{ inputs.job }}
          path: ci_results_${{ inputs.job }}
-      
+
      - uses: actions/checkout@v4
      - uses: actions/download-artifact@v4
      - name: Send message to Slack for quantization workflow
@@ -90,7 +90,7 @@ jobs:
          pip install huggingface_hub
          pip install slack_sdk
          pip show slack_sdk
-          python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}" 
+          python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}"
      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
      - name: Failure table artifacts
@@ -98,4 +98,4 @@ jobs:
        uses: actions/upload-artifact@v4
        with:
          name: ci_results_${{ inputs.job }}
-          path: ci_results_${{ inputs.job }}
+          path: ci_results_${{ inputs.job }}