From 984ff89e7306ad33c46f76afc9aa78d40a8c01d8 Mon Sep 17 00:00:00 2001
From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Date: Mon, 23 Jun 2025 10:56:51 +0200
Subject: [PATCH] Gaudi3 CI (#38790)

---
 .github/workflows/model_jobs_intel_gaudi.yml  | 121 ++++++
 .../workflows/self-scheduled-intel-gaudi.yml  | 345 ++++++++++++++++++
 .../self-scheduled-intel-gaudi3-caller.yml    |  67 ++++
 src/transformers/data/processors/squad.py     |  30 +-
 src/transformers/testing_utils.py             |   3 +
 src/transformers/utils/import_utils.py        |  22 +-
 tests/deepspeed/test_deepspeed.py             |   8 +
 tests/deepspeed/test_model_zoo.py             |   2 +
 tests/fsdp/test_fsdp.py                       |   2 +
 tests/test_modeling_common.py                 |   7 +
 tests/trainer/test_trainer.py                 |   4 +-
 tests/trainer/test_trainer_distributed.py     |   7 +-
 .../trainer/test_trainer_distributed_loss.py  |   2 +
 .../test_trainer_distributed_worker_seed.py   |   2 +
 tests/utils/test_modeling_utils.py            |   2 +
 utils/print_env.py                            |   8 +-
 16 files changed, 618 insertions(+), 14 deletions(-)
 create mode 100644 .github/workflows/model_jobs_intel_gaudi.yml
 create mode 100644 .github/workflows/self-scheduled-intel-gaudi.yml
 create mode 100644 .github/workflows/self-scheduled-intel-gaudi3-caller.yml

diff --git a/.github/workflows/model_jobs_intel_gaudi.yml b/.github/workflows/model_jobs_intel_gaudi.yml
new file mode 100644
index 0000000000..73ff2ba269
--- /dev/null
+++ b/.github/workflows/model_jobs_intel_gaudi.yml
@@ -0,0 +1,121 @@
+name: model jobs
+
+on:
+  workflow_call:
+    inputs:
+      folder_slices:
+        required: true
+        type: string
+      slice_id:
+        required: true
+        type: number
+      runner:
+        required: true
+        type: string
+      machine_type:
+        required: true
+        type: string
+      report_name_prefix:
+        required: false
+        default: run_models_gpu
+        type: string
+
+env:
+  RUN_SLOW: yes
+  PT_HPU_LAZY_MODE: 0
+  TRANSFORMERS_IS_CI: yes
+  PT_ENABLE_INT64_SUPPORT: 1
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  HF_HOME: /mnt/cache/.cache/huggingface
+
+jobs:
+  run_models_gpu:
+    name: " "
+    strategy:
+      max-parallel: 8
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
+    runs-on:
+      group: ${{ inputs.runner }}
+    container:
+      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+      options: --runtime=habana
+        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
+        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
+        --env HABANA_VISIBLE_DEVICES
+        --env HABANA_VISIBLE_MODULES
+        --cap-add=sys_nice
+        --shm-size=64G
+    steps:
+      - name: Echo input and matrix info
+        shell: bash
+        run: |
+          echo "${{ inputs.folder_slices }}"
+          echo "${{ matrix.folders }}"
+          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
+
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install dependencies
+        run: |
+          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn
+
+      - name: HL-SMI
+        run: |
+          hl-smi
+          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
+          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
+
+      - name: Environment
+        run: python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        run: pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        shell: bash
+        run: |
+          if [ "${{ inputs.machine_type }}" = "1gaudi" ]; then
+            machine_type=single-gpu
+          elif [ "${{ inputs.machine_type }}" = "2gaudi" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ inputs.machine_type }}
+          fi
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Run all tests on Gaudi
+        run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
+
+      - name: Run test
+        shell: bash
+        run: |
+          mkdir -p reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
+          echo "hello" > reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
+          echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
+          path: reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
diff --git a/.github/workflows/self-scheduled-intel-gaudi.yml b/.github/workflows/self-scheduled-intel-gaudi.yml
new file mode 100644
index 0000000000..2db5ece064
--- /dev/null
+++ b/.github/workflows/self-scheduled-intel-gaudi.yml
@@ -0,0 +1,345 @@
+name: Self-hosted runner (scheduled-intel-gaudi)
+
+on:
+  workflow_call:
+    inputs:
+      job:
+        required: true
+        type: string
+      slack_report_channel:
+        required: true
+        type: string
+      runner_scale_set:
+        required: true
+        type: string
+      ci_event:
+        required: true
+        type: string
+      report_repo_id:
+        required: true
+        type: string
+
+env:
+  NUM_SLICES: 2
+  RUN_SLOW: yes
+  PT_HPU_LAZY_MODE: 0
+  TRANSFORMERS_IS_CI: yes
+  PT_ENABLE_INT64_SUPPORT: 1
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  HF_HOME: /mnt/cache/.cache/huggingface
+
+jobs:
+  setup:
+    if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
+    name: Setup
+    runs-on: ubuntu-latest
+    outputs:
+      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
+      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
+      quantization_matrix: ${{ steps.set-matrix.outputs.quantization_matrix }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - id: set-matrix
+        if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
+        name: Identify models to test
+        working-directory: tests
+        run: |
+          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
+            echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
+            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
+          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
+            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
+            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
+          fi
+
+      - id: set-matrix-quantization
+        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
+        name: Identify quantization method to test
+        working-directory: tests
+        run: |
+          echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ;  print(d)')" >> $GITHUB_OUTPUT
+
+  run_models_gpu:
+    if: ${{ inputs.job == 'run_models_gpu' }}
+    name: " "
+    needs: setup
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [1gaudi, 2gaudi]
+        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
+    uses: ./.github/workflows/model_jobs_intel_gaudi.yml
+    with:
+      slice_id: ${{ matrix.slice_id }}
+      machine_type: ${{ matrix.machine_type }}
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
+      report_name_prefix: run_models_gpu
+
+    secrets: inherit
+
+  run_trainer_and_fsdp_gpu:
+    if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
+    name: " "
+    needs: setup
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [1gaudi, 2gaudi]
+        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
+    uses: ./.github/workflows/model_jobs_intel_gaudi.yml
+    with:
+      slice_id: ${{ matrix.slice_id }}
+      machine_type: ${{ matrix.machine_type }}
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
+      report_name_prefix: run_trainer_and_fsdp_gpu
+
+    secrets: inherit
+
+  run_pipelines_gpu:
+    if: ${{ inputs.job == 'run_pipelines_gpu' }}
+    name: Pipelines
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [1gaudi, 2gaudi]
+    runs-on:
+      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
+    container:
+      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+      options: --runtime=habana
+        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
+        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
+        --env HABANA_VISIBLE_DEVICES
+        --env HABANA_VISIBLE_MODULES
+        --cap-add=sys_nice
+        --shm-size=64G
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install dependencies
+        run: |
+          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
+
+      - name: HL-SMI
+        run: |
+          hl-smi
+          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
+          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
+
+      - name: Environment
+        run: python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        run: pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        shell: bash
+        run: |
+          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Run all pipeline tests on Intel Gaudi
+        run: |
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: |
+          cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
+          path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports
+
+  run_examples_gpu:
+    if: ${{ inputs.job == 'run_examples_gpu' }}
+    name: Examples directory
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [1gaudi]
+    runs-on:
+      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
+    container:
+      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+      options: --runtime=habana
+        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
+        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
+        --env HABANA_VISIBLE_DEVICES
+        --env HABANA_VISIBLE_MODULES
+        --cap-add=sys_nice
+        --shm-size=64G
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install dependencies
+        run: |
+          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
+
+      - name: HL-SMI
+        run: |
+          hl-smi
+          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
+          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
+
+      - name: Environment
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        run: |
+          pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        shell: bash
+        run: |
+          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Run examples tests on Intel Gaudi
+        run: |
+          pip install -r examples/pytorch/_tests_requirements.txt
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: |
+          cat reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_examples_gpu_test_reports
+          path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports
+
+  run_deepspeed_gpu:
+    if: ${{ inputs.job == 'run_deepspeed_gpu' }}
+    name: Intel Gaudi deepspeed tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [1gaudi, 2gaudi]
+    runs-on:
+      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
+    container:
+      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+      options: --runtime=habana
+        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
+        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
+        --env HABANA_VISIBLE_DEVICES
+        --env HABANA_VISIBLE_MODULES
+        --cap-add=sys_nice
+        --shm-size=64G
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install dependencies
+        run: |
+          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
+          pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0
+
+      - name: HL-SMI
+        run: |
+          hl-smi
+          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
+          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
+
+      - name: Environment
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        run: |
+          pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        shell: bash
+        run: |
+          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Run all deepspeed tests on intel Gaudi
+        run: |
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: |
+          cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
+          path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports
+
+  send_results:
+    name: Slack Report
+    needs:
+      [
+        setup,
+        run_models_gpu,
+        run_examples_gpu,
+        run_pipelines_gpu,
+        run_deepspeed_gpu,
+        run_trainer_and_fsdp_gpu,
+      ]
+    if: ${{ always() }}
+    uses: ./.github/workflows/slack-report.yml
+    with:
+      job: ${{ inputs.job }}
+      setup_status: ${{ needs.setup.result }}
+      slack_report_channel: ${{ inputs.slack_report_channel }}
+      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      report_repo_id: ${{ inputs.report_repo_id }}
+      ci_event: ${{ inputs.ci_event }}
+
+    secrets: inherit
diff --git a/.github/workflows/self-scheduled-intel-gaudi3-caller.yml b/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
new file mode 100644
index 0000000000..83cb89290d
--- /dev/null
+++ b/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
@@ -0,0 +1,67 @@
+name: Self-hosted runner (Intel Gaudi3 scheduled CI caller)
+
+on:
+  repository_dispatch:
+  workflow_dispatch:
+  schedule:
+    - cron: "17 2 * * *"
+
+jobs:
+  model-ci:
+    name: Model CI
+    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
+    with:
+      job: run_models_gpu
+      ci_event: Scheduled CI (Intel) - Gaudi3
+      runner_scale_set: itac-bm-emr-gaudi3-dell
+      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
+      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
+
+    secrets: inherit
+
+  pipeline-ci:
+    name: Pipeline CI
+    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
+    with:
+      job: run_pipelines_gpu
+      ci_event: Scheduled CI (Intel) - Gaudi3
+      runner_scale_set: itac-bm-emr-gaudi3-dell
+      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
+      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
+
+    secrets: inherit
+
+  example-ci:
+    name: Example CI
+    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
+    with:
+      job: run_examples_gpu
+      ci_event: Scheduled CI (Intel) - Gaudi3
+      runner_scale_set: itac-bm-emr-gaudi3-dell
+      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
+      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
+
+    secrets: inherit
+
+  deepspeed-ci:
+    name: DeepSpeed CI
+    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
+    with:
+      job: run_deepspeed_gpu
+      ci_event: Scheduled CI (Intel) - Gaudi3
+      runner_scale_set: itac-bm-emr-gaudi3-dell
+      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
+      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
+
+    secrets: inherit
+
+  trainer-fsdp-ci:
+    name: Trainer/FSDP CI
+    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
+    with:
+      job: run_trainer_and_fsdp_gpu
+      ci_event: Scheduled CI (Intel) - Gaudi3
+      runner_scale_set: itac-bm-emr-gaudi3-dell
+      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
+      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
+    secrets: inherit
diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py
index 5f3cd0fd28..4a1b44146c 100644
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -23,7 +23,7 @@ from tqdm import tqdm
 
 from ...models.bert.tokenization_bert import whitespace_tokenize
 from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
-from ...utils import is_tf_available, is_torch_available, logging
+from ...utils import is_tf_available, is_torch_available, is_torch_hpu_available, logging
 from .utils import DataProcessor
 
 
@@ -361,11 +361,29 @@ def squad_convert_examples_to_features(
         is_training=not evaluate,
     )
     ```"""
-    # Defining helper methods
-    features = []
 
-    threads = min(threads, cpu_count())
-    with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
+    if not is_torch_hpu_available():
+        threads = min(threads, cpu_count())
+        with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
+            annotate_ = partial(
+                squad_convert_example_to_features,
+                max_seq_length=max_seq_length,
+                doc_stride=doc_stride,
+                max_query_length=max_query_length,
+                padding_strategy=padding_strategy,
+                is_training=is_training,
+            )
+            features = list(
+                tqdm(
+                    p.imap(annotate_, examples, chunksize=32),
+                    total=len(examples),
+                    desc="convert squad examples to features",
+                    disable=not tqdm_enabled,
+                )
+            )
+    else:
+        # Non-parallel version for hpu https://github.com/huggingface/transformers/pull/38790#discussion_r2156470902
+        squad_convert_example_to_features_init(tokenizer_for_convert=tokenizer)
         annotate_ = partial(
             squad_convert_example_to_features,
             max_seq_length=max_seq_length,
@@ -376,7 +394,7 @@ def squad_convert_examples_to_features(
         )
         features = list(
             tqdm(
-                p.imap(annotate_, examples, chunksize=32),
+                map(annotate_, examples),
                 total=len(examples),
                 desc="convert squad examples to features",
                 disable=not tqdm_enabled,
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 8f7aedb625..7c3c6ccac8 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -3007,6 +3007,9 @@ class HfDoctestModule(Module):
 
 def _device_agnostic_dispatch(device: str, dispatch_table: dict[str, Callable], *args, **kwargs):
     if device not in dispatch_table:
+        if not callable(dispatch_table["default"]):
+            return dispatch_table["default"]
+
         return dispatch_table["default"](*args, **kwargs)
 
     fn = dispatch_table[device]
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index 6ae1b36d0b..a933c9638d 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -815,8 +815,8 @@ def is_torch_hpu_available():
     ):
         return False
 
-    torch_hpu_min_version = "1.5.0"
-    if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_version):
+    torch_hpu_min_accelerate_version = "1.5.0"
+    if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_accelerate_version):
         return False
 
     import torch
@@ -850,6 +850,24 @@ def is_torch_hpu_available():
 
         torch.Tensor.masked_fill_ = patched_masked_fill_
 
+    # IlyasMoutawwakil: we patch torch.compile to use the HPU backend by default
+    # https://github.com/huggingface/transformers/pull/38790#discussion_r2157043944
+    # This is necessary for cases where torch.compile is used as a decorator (defaulting to inductor)
+    # https://github.com/huggingface/transformers/blob/af6120b3eb2470b994c21421bb6eaa76576128b0/src/transformers/models/modernbert/modeling_modernbert.py#L204
+    original_compile = torch.compile
+
+    def hpu_backend_compile(*args, **kwargs):
+        if kwargs.get("backend", None) not in ["hpu_backend", "eager"]:
+            logger.warning(
+                f"Calling torch.compile with backend={kwargs.get('backend', None)} on a Gaudi device is not supported. "
+                "We will override the backend with 'hpu_backend' to avoid errors."
+            )
+            kwargs["backend"] = "hpu_backend"
+
+        return original_compile(*args, **kwargs)
+
+    torch.compile = hpu_backend_compile
+
     return True
 
 
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index f07756f731..1a4966b09e 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -1134,10 +1134,12 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
 
     @parameterized.expand(params, name_func=parameterized_custom_name_func)
     @require_torch_multi_accelerator
+    @run_first
     def test_basic_distributed(self, stage, dtype):
         self.run_and_check(stage=stage, dtype=dtype, distributed=True)
 
     @require_torch_fp16
+    @run_first
     def test_do_eval_no_train(self):
         # testing only zero3 since zero2 makes no sense with inference
         self.run_and_check(
@@ -1150,6 +1152,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         )
 
     @parameterized.expand(params, name_func=parameterized_custom_name_func)
+    @run_first
     def test_fp32_non_distributed(self, stage, dtype):
         # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
         # therefore no quality checks, just basic completion checks are done
@@ -1166,6 +1169,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
 
     @parameterized.expand(params, name_func=parameterized_custom_name_func)
     @require_torch_multi_accelerator
+    @run_first
     def test_fp32_distributed(self, stage, dtype):
         # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
         # therefore no quality checks, just basic completion checks are done
@@ -1181,6 +1185,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         )
 
     @parameterized.expand(params, name_func=parameterized_custom_name_func)
+    @run_first
     def test_resume_train_not_from_ds_checkpoint(self, stage, dtype):
         # do normal training and then resume not from the deepspeed checkpoint but explicitly from
         # the saved model dir
@@ -1207,6 +1212,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
 
     @parameterized.expand(["bf16", "fp16", "fp32"])
     @require_torch_multi_accelerator
+    @run_first
     def test_inference(self, dtype):
         if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
             self.skipTest(reason="test requires bfloat16 hardware support")
@@ -1361,6 +1367,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         return output_dir
 
     @parameterized.expand(params, name_func=parameterized_custom_name_func)
+    @run_first
     def test_clm(self, stage, dtype):
         # this test exercises model.resize_token_embeddings() which requires param gathering outside
         # of forward - it's not used by `run_translation.py`, but it is in `run_clm.py`
@@ -1397,6 +1404,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
         execute_subprocess_async(cmd, env=self.get_env())
 
     @require_torch_fp16
+    @run_first
     def test_clm_from_config_zero3_fp16(self):
         # this test exercises AutoModel.from_config(config) - to ensure zero.Init is called
 
diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py
index dba3e18abb..b2c277b862 100644
--- a/tests/deepspeed/test_model_zoo.py
+++ b/tests/deepspeed/test_model_zoo.py
@@ -28,6 +28,7 @@ from transformers.testing_utils import (
     get_tests_dir,
     require_deepspeed,
     require_torch_accelerator,
+    run_first,
     slow,
     torch_device,
 )
@@ -327,6 +328,7 @@ params = list(itertools.product(stages, task_cmds.keys()))
 
 
 @slow
+@run_first
 @require_deepspeed
 @require_torch_accelerator
 class TestDeepSpeedModelZoo(TestCasePlus):
diff --git a/tests/fsdp/test_fsdp.py b/tests/fsdp/test_fsdp.py
index 781199747f..a932a1fbac 100644
--- a/tests/fsdp/test_fsdp.py
+++ b/tests/fsdp/test_fsdp.py
@@ -358,6 +358,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
             raise AssertionError("CPU offloading failed with FSDP!")
 
     @require_torch_multi_accelerator
+    @run_first
     @slow
     @require_fsdp_v2_version
     @require_accelerate_fsdp2
@@ -405,6 +406,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
                 self.assertAlmostEqual(log["learning_rate"], log1["learning_rate"], delta=1e-5)
 
     @require_torch_multi_accelerator
+    @run_first
     @slow
     @require_fsdp
     @require_fsdp_v2_version
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 4e2555b57e..874323b1e9 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -84,6 +84,7 @@ from transformers.testing_utils import (
     require_bitsandbytes,
     require_deepspeed,
     require_flash_attn,
+    require_non_hpu,
     require_safetensors,
     require_torch,
     require_torch_accelerator,
@@ -92,6 +93,7 @@ from transformers.testing_utils import (
     require_torch_multi_accelerator,
     require_torch_multi_gpu,
     require_torch_sdpa,
+    run_first,
     run_test_using_subprocess,
     set_config_for_less_flaky_test,
     set_model_for_less_flaky_test,
@@ -2797,6 +2799,7 @@ class ModelTesterMixin:
                     else:
                         torch.testing.assert_close(base_output[0], new_output[0], rtol=1e-5, atol=1e-5)
 
+    @require_non_hpu
     @require_accelerate
     @mark.accelerate_tests
     @require_torch_multi_accelerator
@@ -3727,6 +3730,9 @@ class ModelTesterMixin:
                 if torch_device in ["cpu", "cuda"]:
                     atol = atols[torch_device, enable_kernels, torch_dtype]
                     rtol = rtols[torch_device, enable_kernels, torch_dtype]
+                elif torch_device == "hpu":
+                    atol = atols["cuda", enable_kernels, torch_dtype]
+                    rtol = rtols["cuda", enable_kernels, torch_dtype]
                 elif torch_device == "xpu":
                     # As of PyTorch 2.5 XPU backend supports only torch.nn.attention.SDPBackend.MATH
                     # which is implemented on PyTorch level using aten operators and is
@@ -4666,6 +4672,7 @@ class ModelTesterMixin:
 
     # Here we need to run with a subprocess as otherwise setting back the default device to the default value ("cpu")
     # may bring unwanted consequences on other tests. See PR #37553
+    @run_first
     @run_test_using_subprocess
     @require_torch_accelerator
     def test_can_load_with_global_device_set(self):
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 2594edcdef..878940b937 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -3062,6 +3062,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
     # the test slower.
     @require_torch_non_multi_accelerator
     @run_test_using_subprocess
+    @run_first
     @slow
     def test_can_resume_training_lm(self):
         # Check if it works for a simple language modeling example
@@ -3517,7 +3518,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
                 )
 
     @slow
-    @run_first
     def test_trainer_eval_mrpc(self):
         MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -3534,7 +3534,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
             self.assertLess(result["eval_loss"], 0.2)
 
     @slow
-    @run_first
     def test_trainer_eval_multiple(self):
         MODEL_ID = "openai-community/gpt2"
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -4125,6 +4124,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
             self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)
 
     @slow
+    @run_first
     @require_non_hpu
     @require_torch_multi_accelerator
     def test_end_to_end_example(self):
diff --git a/tests/trainer/test_trainer_distributed.py b/tests/trainer/test_trainer_distributed.py
index 17f7bee23b..54568caac3 100644
--- a/tests/trainer/test_trainer_distributed.py
+++ b/tests/trainer/test_trainer_distributed.py
@@ -22,6 +22,7 @@ from transformers.testing_utils import (
     execute_subprocess_async,
     get_torch_dist_unique_port,
     require_torch_multi_accelerator,
+    run_first,
     torch_device,
 )
 from transformers.training_args import ParallelMode
@@ -116,6 +117,7 @@ if is_torch_available():
 
 
 class TestTrainerDistributed(TestCasePlus):
+    @run_first
     @require_torch_multi_accelerator
     def test_trainer(self):
         distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
@@ -199,8 +201,7 @@ if __name__ == "__main__":
     model = RegressionModel()
     training_args.per_device_train_batch_size = 1
     training_args.max_steps = 1
-    training_args.accelerator_config = {
-        "dispatch_batches": False,
-    }
+    training_args.accelerator_config.dispatch_batches = False
+
     trainer = Trainer(model, training_args, train_dataset=train_dataset)
     trainer.train()
diff --git a/tests/trainer/test_trainer_distributed_loss.py b/tests/trainer/test_trainer_distributed_loss.py
index 405763125e..93cc042fe6 100644
--- a/tests/trainer/test_trainer_distributed_loss.py
+++ b/tests/trainer/test_trainer_distributed_loss.py
@@ -18,11 +18,13 @@ from transformers.testing_utils import (
     execute_subprocess_async,
     get_torch_dist_unique_port,
     require_torch_multi_accelerator,
+    run_first,
     torch_device,
 )
 
 
 class TestTrainerDistributedLoss(TestCasePlus):
+    @run_first
     @require_torch_multi_accelerator
     def test_trainer(self):
         device_count = backend_device_count(torch_device)
diff --git a/tests/trainer/test_trainer_distributed_worker_seed.py b/tests/trainer/test_trainer_distributed_worker_seed.py
index 3fa625af74..ded0382677 100644
--- a/tests/trainer/test_trainer_distributed_worker_seed.py
+++ b/tests/trainer/test_trainer_distributed_worker_seed.py
@@ -18,6 +18,7 @@ from transformers.testing_utils import (
     execute_subprocess_async,
     get_torch_dist_unique_port,
     require_torch_multi_accelerator,
+    run_first,
     torch_device,
 )
 
@@ -57,6 +58,7 @@ class DummyModel(nn.Module):
 
 
 class TestTrainerDistributedWorkerSeed(TestCasePlus):
+    @run_first
     @require_torch_multi_accelerator
     def test_trainer(self):
         device_count = backend_device_count(torch_device)
diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
index 92a38baf94..903283dd4a 100644
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -58,6 +58,7 @@ from transformers.testing_utils import (
     is_staging_test,
     require_accelerate,
     require_flax,
+    require_non_hpu,
     require_read_token,
     require_safetensors,
     require_tf,
@@ -1002,6 +1003,7 @@ class ModelUtilsTest(TestCasePlus):
 
         self.assertIsNotNone(model)
 
+    @require_non_hpu
     @require_accelerate
     @mark.accelerate_tests
     @require_torch_multi_accelerator
diff --git a/utils/print_env.py b/utils/print_env.py
index 04ea99947e..e6d54fff2c 100644
--- a/utils/print_env.py
+++ b/utils/print_env.py
@@ -21,7 +21,7 @@ import os
 import sys
 
 import transformers
-from transformers import is_torch_xpu_available
+from transformers import is_torch_hpu_available, is_torch_xpu_available
 
 
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@@ -38,6 +38,9 @@ try:
         accelerator = "CUDA"
     elif is_torch_xpu_available():
         accelerator = "XPU"
+    elif is_torch_hpu_available():
+        accelerator = "HPU"
+
     print("Torch accelerator:", accelerator)
 
     if accelerator == "CUDA":
@@ -48,6 +51,9 @@ try:
     elif accelerator == "XPU":
         print("SYCL version:", torch.version.xpu)
         print("Number of XPUs available:", torch.xpu.device_count())
+    elif accelerator == "HPU":
+        print("HPU version:", torch.__version__.split("+")[-1])
+        print("Number of HPUs available:", torch.hpu.device_count())
 except ImportError:
     print("Torch version:", None)