Gaudi3 CI (#38790)

2025-06-23 10:56:51 +02:00
parent 2166b6b4ff
commit 984ff89e73
16 changed files with 618 additions and 14 deletions
--- a/.github/workflows/model_jobs_intel_gaudi.yml
+++ b/.github/workflows/model_jobs_intel_gaudi.yml
@@ -0,0 +1,121 @@
 # Reusable workflow: runs one slice of test folders on an Intel Gaudi runner group.
 name: model jobs
 on:
  workflow_call:
    inputs:
      # JSON string holding a list of folder lists; indexed by `slice_id` to build the job matrix.
      folder_slices:
        required: true
        type: string
      # Index into `folder_slices` selecting the slice this call is responsible for.
      slice_id:
        required: true
        type: number
      # Name of the runner group the job is scheduled on.
      runner:
        required: true
        type: string
      # `1gaudi` / `2gaudi` are mapped to `single-gpu` / `multi-gpu` for report names; other values pass through.
      machine_type:
        required: true
        type: string
      # Middle part of the report directory and artifact names.
      report_name_prefix:
        required: false
        default: run_models_gpu
        type: string
 env:
  # Quoted so the value is the literal string "yes" regardless of the YAML parser's truthy handling.
  RUN_SLOW: "yes"
  PT_HPU_LAZY_MODE: 0
  TRANSFORMERS_IS_CI: "yes"
  PT_ENABLE_INT64_SUPPORT: 1
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  # Same path as the host cache directory mounted into the job container.
  HF_HOME: /mnt/cache/.cache/huggingface
 jobs:
  # Runs pytest for every folder of the selected slice (one matrix job per folder)
  # inside the Gaudi PyTorch container and uploads the resulting report directory.
  run_models_gpu:
    name: " "
    strategy:
      max-parallel: 8
      fail-fast: false
      matrix:
        # One matrix entry per folder in the slice selected by `slice_id`.
        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
    runs-on:
      group: ${{ inputs.runner }}
    container:
      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      # Multi-line plain scalar: the continuation lines below fold into a single option string.
      options: --runtime=habana
        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
        --env HABANA_VISIBLE_DEVICES
        --env HABANA_VISIBLE_MODULES
        --cap-add=sys_nice
        --shm-size=64G
    steps:
      # Debug output: the raw input, this job's folder, and the selected slice.
      - name: Echo input and matrix info
        shell: bash
        run: |
          echo "${{ inputs.folder_slices }}"
          echo "${{ matrix.folders }}"
          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
      # Exports `matrix_folders` (folder name with `models/` replaced by `models_`) for later steps;
      # it is used below for the artifact name.
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        run: |
          echo "${{ matrix.folders }}"
          matrix_folders=${{ matrix.folders }}
          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      # Editable install of the checked-out repository with the testing and torch extras.
      - name: Install dependencies
        run: |
          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn
      # Prints accelerator status and the device visibility variables passed into the container.
      - name: HL-SMI
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
      - name: Environment
        run: python3 utils/print_env.py
      - name: Show installed libraries and their versions
        run: pip freeze
      # Exports `machine_type` for later steps: 1gaudi -> single-gpu, 2gaudi -> multi-gpu, otherwise unchanged.
      - name: Set `machine_type` for report and artifact names
        shell: bash
        run: |
          if [ "${{ inputs.machine_type }}" = "1gaudi" ]; then
            machine_type=single-gpu
          elif [ "${{ inputs.machine_type }}" = "2gaudi" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ inputs.machine_type }}
          fi
          echo "machine_type=$machine_type" >> $GITHUB_ENV
      # Runs pytest on tests/<folder>; the report id is built from the raw `matrix.folders` value.
      - name: Run all tests on Gaudi
        run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
      # Only runs after a failure; a missing report file must not add another failure.
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
      # Writes a placeholder file into the report directory and echoes the report name.
      - name: Run test
        shell: bash
        run: |
          mkdir -p reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
          echo "hello" > reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
          echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
      # Always uploads. The artifact name uses `env.matrix_folders` (with `models_`),
      # while the path uses the raw `matrix.folders` value.
      - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
          path: reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
--- a/.github/workflows/self-scheduled-intel-gaudi.yml
+++ b/.github/workflows/self-scheduled-intel-gaudi.yml
@@ -0,0 +1,345 @@
 # Reusable workflow: runs the test job selected by the `job` input on Intel Gaudi runners
 # and forwards the results to the Slack report workflow.
 name: Self-hosted runner (scheduled-intel-gaudi)
 on:
  workflow_call:
    inputs:
      # Selects which job of this workflow runs; each job compares it against its own name.
      job:
        required: true
        type: string
      # Forwarded to the Slack report workflow.
      slack_report_channel:
        required: true
        type: string
      # Runner group prefix; jobs append `-<machine_type>` to it.
      runner_scale_set:
        required: true
        type: string
      # Forwarded to the Slack report workflow.
      ci_event:
        required: true
        type: string
      # Forwarded to the Slack report workflow.
      report_repo_id:
        required: true
        type: string
 env:
  # Number of slices the model test folders are split into by the setup job.
  NUM_SLICES: 2
  # Quoted so the value is the literal string "yes" regardless of the YAML parser's truthy handling.
  RUN_SLOW: "yes"
  PT_HPU_LAZY_MODE: 0
  TRANSFORMERS_IS_CI: "yes"
  PT_ENABLE_INT64_SUPPORT: 1
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  # Same path as the host cache directory mounted into the job containers.
  HF_HOME: /mnt/cache/.cache/huggingface
 jobs:
  # Computes the matrices consumed by the downstream jobs. Runs only for the two
  # sliced jobs (`run_models_gpu`, `run_trainer_and_fsdp_gpu`).
  setup:
    if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
    name: Setup
    runs-on: ubuntu-latest
    outputs:
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      # `quantization_matrix` is written by the `set-matrix-quantization` step, not by `set-matrix`.
      quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      # Writes `folder_slices` and `slice_ids`: computed by utils/split_model_tests.py for the
      # model job, fixed to the `trainer` and `fsdp` folders for the trainer/FSDP job.
      - id: set-matrix
        if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
        name: Identify models to test
        working-directory: tests
        run: |
          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
            echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
          fi
      # Writes `quantization_matrix`: the sorted sub-directories of tests/quantization.
      # NOTE(review): the job-level `if` above never admits `run_quantization_torch_gpu`,
      # so this step cannot run as the workflow stands - confirm whether that is intended.
      - id: set-matrix-quantization
        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
        name: Identify quantization method to test
        working-directory: tests
        run: |
          echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ;  print(d)')" >> $GITHUB_OUTPUT
  # Calls the reusable model-jobs workflow once per (machine_type, slice_id) pair
  # when the `job` input is `run_models_gpu`.
  run_models_gpu:
    if: ${{ inputs.job == 'run_models_gpu' }}
    name: " "
    needs: setup
    strategy:
      fail-fast: false
      matrix:
        machine_type: [1gaudi, 2gaudi]
        # Slice indices come from the setup job's `slice_ids` output.
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs_intel_gaudi.yml
    with:
      slice_id: ${{ matrix.slice_id }}
      machine_type: ${{ matrix.machine_type }}
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      # Runner group name is `<runner_scale_set>-<machine_type>`.
      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
      report_name_prefix: run_models_gpu
    secrets: inherit
  # Calls the reusable model-jobs workflow once per (machine_type, slice_id) pair
  # when the `job` input is `run_trainer_and_fsdp_gpu`.
  run_trainer_and_fsdp_gpu:
    if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
    name: " "
    needs: setup
    strategy:
      fail-fast: false
      matrix:
        machine_type: [1gaudi, 2gaudi]
        # Slice indices come from the setup job's `slice_ids` output.
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs_intel_gaudi.yml
    with:
      slice_id: ${{ matrix.slice_id }}
      machine_type: ${{ matrix.machine_type }}
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      # Runner group name is `<runner_scale_set>-<machine_type>`.
      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
      report_name_prefix: run_trainer_and_fsdp_gpu
    secrets: inherit
  # Runs tests/pipelines on both machine types when the `job` input is `run_pipelines_gpu`.
  run_pipelines_gpu:
    if: ${{ inputs.job == 'run_pipelines_gpu' }}
    name: Pipelines
    strategy:
      fail-fast: false
      matrix:
        machine_type: [1gaudi, 2gaudi]
    runs-on:
      # Runner group name is `<runner_scale_set>-<machine_type>`.
      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
    container:
      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      # Multi-line plain scalar: the continuation lines below fold into a single option string.
      options: --runtime=habana
        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
        --env HABANA_VISIBLE_DEVICES
        --env HABANA_VISIBLE_MODULES
        --cap-add=sys_nice
        --shm-size=64G
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      # Editable install of the checked-out repository plus audio libraries.
      - name: Install dependencies
        run: |
          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
      # Prints accelerator status and the device visibility variables passed into the container.
      - name: HL-SMI
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
      - name: Environment
        run: python3 utils/print_env.py
      - name: Show installed libraries and their versions
        run: pip freeze
      # Exports `machine_type` for later steps: 1gaudi -> single-gpu, 2gaudi -> multi-gpu, otherwise unchanged.
      - name: Set `machine_type` for report and artifact names
        shell: bash
        run: |
          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
          fi
          echo "machine_type=$machine_type" >> $GITHUB_ENV
      # Deselects tests carrying the `not_device_test` marker.
      - name: Run all pipeline tests on Intel Gaudi
        run: |
          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"
      # Only runs after a failure; a missing report file must not add another failure.
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
          cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt
      # Always uploads the report directory, whatever the test outcome.
      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
          path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports
  # Runs the examples/pytorch tests on the single-device machine type when the `job` input is `run_examples_gpu`.
  run_examples_gpu:
    if: ${{ inputs.job == 'run_examples_gpu' }}
    name: Examples directory
    strategy:
      fail-fast: false
      matrix:
        # Only the single-device configuration is listed for the examples.
        machine_type: [1gaudi]
    runs-on:
      # Runner group name is `<runner_scale_set>-<machine_type>`.
      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
    container:
      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      # Multi-line plain scalar: the continuation lines below fold into a single option string.
      options: --runtime=habana
        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
        --env HABANA_VISIBLE_DEVICES
        --env HABANA_VISIBLE_MODULES
        --cap-add=sys_nice
        --shm-size=64G
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      # Editable install of the checked-out repository plus audio libraries.
      - name: Install dependencies
        run: |
          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
      # Prints accelerator status and the device visibility variables passed into the container.
      - name: HL-SMI
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
      - name: Environment
        run: |
          python3 utils/print_env.py
      - name: Show installed libraries and their versions
        run: |
          pip freeze
      # Exports `machine_type` for later steps: 1gaudi -> single-gpu, 2gaudi -> multi-gpu, otherwise unchanged.
      - name: Set `machine_type` for report and artifact names
        shell: bash
        run: |
          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
          fi
          echo "machine_type=$machine_type" >> $GITHUB_ENV
      # Installs the example test requirements, then runs the examples while deselecting `not_device_test` tests.
      - name: Run examples tests on Intel Gaudi
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"
      # Only runs after a failure; a missing report file must not add another failure.
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
          cat reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
      # Always uploads the report directory, whatever the test outcome.
      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.machine_type }}_run_examples_gpu_test_reports
          path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports
  # Runs tests/deepspeed on both machine types when the `job` input is `run_deepspeed_gpu`.
  run_deepspeed_gpu:
    if: ${{ inputs.job == 'run_deepspeed_gpu' }}
    name: Intel Gaudi deepspeed tests
    strategy:
      fail-fast: false
      matrix:
        machine_type: [1gaudi, 2gaudi]
    runs-on:
      # Runner group name is `<runner_scale_set>-<machine_type>`.
      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
    container:
      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      # Multi-line plain scalar: the continuation lines below fold into a single option string.
      options: --runtime=habana
        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
        --env HABANA_VISIBLE_DEVICES
        --env HABANA_VISIBLE_MODULES
        --cap-add=sys_nice
        --shm-size=64G
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      # Installs the repository plus the HabanaAI DeepSpeed fork.
      # NOTE(review): DeepSpeed is pinned to tag 1.20.0 while the container image path is 1.21.1 -
      # confirm the version mismatch is intentional.
      - name: Install dependencies
        run: |
          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
          pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0
      # Prints accelerator status and the device visibility variables passed into the container.
      - name: HL-SMI
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"
      - name: Environment
        run: |
          python3 utils/print_env.py
      - name: Show installed libraries and their versions
        run: |
          pip freeze
      # Exports `machine_type` for later steps: 1gaudi -> single-gpu, 2gaudi -> multi-gpu, otherwise unchanged.
      - name: Set `machine_type` for report and artifact names
        shell: bash
        run: |
          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
          fi
          echo "machine_type=$machine_type" >> $GITHUB_ENV
      # Deselects tests carrying the `not_device_test` marker.
      - name: Run all deepspeed tests on intel Gaudi
        run: |
          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"
      # Only runs after a failure; a missing report file must not add another failure.
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
          cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt
      # Always uploads the report directory, whatever the test outcome.
      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
          path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports
  # Calls the Slack report workflow after all test jobs, whatever their outcome.
  send_results:
    name: Slack Report
    needs:
      [
        setup,
        run_models_gpu,
        run_examples_gpu,
        run_pipelines_gpu,
        run_deepspeed_gpu,
        run_trainer_and_fsdp_gpu,
      ]
    # `always()` keeps the report running even when needed jobs failed or were skipped.
    if: ${{ always() }}
    uses: ./.github/workflows/slack-report.yml
    with:
      job: ${{ inputs.job }}
      # Result of the setup job, passed through for the report.
      setup_status: ${{ needs.setup.result }}
      slack_report_channel: ${{ inputs.slack_report_channel }}
      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      report_repo_id: ${{ inputs.report_repo_id }}
      ci_event: ${{ inputs.ci_event }}
    secrets: inherit
--- a/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
+++ b/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
@@ -0,0 +1,67 @@
 # Entry-point workflow for the Gaudi3 CI: calls the shared Intel Gaudi workflow once per test job,
 # changing only the `job` input between calls.
 name: Self-hosted runner (Intel Gaudi3 scheduled CI caller)
 on:
  repository_dispatch:
  workflow_dispatch:
  schedule:
    # Every day at 02:17 (GitHub evaluates cron schedules in UTC).
    - cron: "17 2 * * *"
 jobs:
  # Model tests.
  model-ci:
    name: Model CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    with:
      job: run_models_gpu
      ci_event: Scheduled CI (Intel) - Gaudi3
      runner_scale_set: itac-bm-emr-gaudi3-dell
      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
    secrets: inherit
  # Pipeline tests.
  pipeline-ci:
    name: Pipeline CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    with:
      job: run_pipelines_gpu
      ci_event: Scheduled CI (Intel) - Gaudi3
      runner_scale_set: itac-bm-emr-gaudi3-dell
      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
    secrets: inherit
  # Example script tests.
  example-ci:
    name: Example CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    with:
      job: run_examples_gpu
      ci_event: Scheduled CI (Intel) - Gaudi3
      runner_scale_set: itac-bm-emr-gaudi3-dell
      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
    secrets: inherit
  # DeepSpeed tests.
  deepspeed-ci:
    name: DeepSpeed CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    with:
      job: run_deepspeed_gpu
      ci_event: Scheduled CI (Intel) - Gaudi3
      runner_scale_set: itac-bm-emr-gaudi3-dell
      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
    secrets: inherit
  # Trainer and FSDP tests.
  trainer-fsdp-ci:
    name: Trainer/FSDP CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    with:
      job: run_trainer_and_fsdp_gpu
      ci_event: Scheduled CI (Intel) - Gaudi3
      runner_scale_set: itac-bm-emr-gaudi3-dell
      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
    secrets: inherit
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -23,7 +23,7 @@ from tqdm import tqdm
 from ...models.bert.tokenization_bert import whitespace_tokenize
 from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
-from ...utils import is_tf_available, is_torch_available, logging
+from ...utils import is_tf_available, is_torch_available, is_torch_hpu_available, logging
 from .utils import DataProcessor
@@ -361,11 +361,29 @@ def squad_convert_examples_to_features(
        is_training=not evaluate,
    )
    ```"""
    # Defining helper methods
    features = []
-    threads = min(threads, cpu_count())
+    if not is_torch_hpu_available():
-    with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
+        threads = min(threads, cpu_count())
        with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
            annotate_ = partial(
                squad_convert_example_to_features,
                max_seq_length=max_seq_length,
                doc_stride=doc_stride,
                max_query_length=max_query_length,
                padding_strategy=padding_strategy,
                is_training=is_training,
            )
            features = list(
                tqdm(
                    p.imap(annotate_, examples, chunksize=32),
                    total=len(examples),
                    desc="convert squad examples to features",
                    disable=not tqdm_enabled,
                )
            )
    else:
        # Non-parallel version for hpu https://github.com/huggingface/transformers/pull/38790#discussion_r2156470902
        squad_convert_example_to_features_init(tokenizer_for_convert=tokenizer)
        annotate_ = partial(
            squad_convert_example_to_features,
            max_seq_length=max_seq_length,
@@ -376,7 +394,7 @@ def squad_convert_examples_to_features(
        )
        features = list(
            tqdm(
-                p.imap(annotate_, examples, chunksize=32),
+                map(annotate_, examples),
                total=len(examples),
                desc="convert squad examples to features",
                disable=not tqdm_enabled,
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -3007,6 +3007,9 @@ class HfDoctestModule(Module):
 def _device_agnostic_dispatch(device: str, dispatch_table: dict[str, Callable], *args, **kwargs):
    if device not in dispatch_table:
        if not callable(dispatch_table["default"]):
            return dispatch_table["default"]
        return dispatch_table["default"](*args, **kwargs)
    fn = dispatch_table[device]
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -815,8 +815,8 @@ def is_torch_hpu_available():
    ):
        return False
-    torch_hpu_min_version = "1.5.0"
+    torch_hpu_min_accelerate_version = "1.5.0"
-    if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_version):
+    if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_accelerate_version):
        return False
    import torch
@@ -850,6 +850,24 @@ def is_torch_hpu_available():
        torch.Tensor.masked_fill_ = patched_masked_fill_
    # IlyasMoutawwakil: we patch torch.compile to use the HPU backend by default
    # https://github.com/huggingface/transformers/pull/38790#discussion_r2157043944
    # This is necessary for cases where torch.compile is used as a decorator (defaulting to inductor)
    # https://github.com/huggingface/transformers/blob/af6120b3eb2470b994c21421bb6eaa76576128b0/src/transformers/models/modernbert/modeling_modernbert.py#L204
    original_compile = torch.compile
    def hpu_backend_compile(*args, **kwargs):
        if kwargs.get("backend", None) not in ["hpu_backend", "eager"]:
            logger.warning(
                f"Calling torch.compile with backend={kwargs.get('backend', None)} on a Gaudi device is not supported. "
                "We will override the backend with 'hpu_backend' to avoid errors."
            )
            kwargs["backend"] = "hpu_backend"
        return original_compile(*args, **kwargs)
    torch.compile = hpu_backend_compile
    return True
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -1134,10 +1134,12 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
    @parameterized.expand(params, name_func=parameterized_custom_name_func)
    @require_torch_multi_accelerator
    @run_first
    def test_basic_distributed(self, stage, dtype):
        self.run_and_check(stage=stage, dtype=dtype, distributed=True)
    @require_torch_fp16
    @run_first
    def test_do_eval_no_train(self):
        # testing only zero3 since zero2 makes no sense with inference
        self.run_and_check(
@@ -1150,6 +1152,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        )
    @parameterized.expand(params, name_func=parameterized_custom_name_func)
    @run_first
    def test_fp32_non_distributed(self, stage, dtype):
        # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
        # therefore no quality checks, just basic completion checks are done
@@ -1166,6 +1169,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
    @parameterized.expand(params, name_func=parameterized_custom_name_func)
    @require_torch_multi_accelerator
    @run_first
    def test_fp32_distributed(self, stage, dtype):
        # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
        # therefore no quality checks, just basic completion checks are done
@@ -1181,6 +1185,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        )
    @parameterized.expand(params, name_func=parameterized_custom_name_func)
    @run_first
    def test_resume_train_not_from_ds_checkpoint(self, stage, dtype):
        # do normal training and then resume not from the deepspeed checkpoint but explicitly from
        # the saved model dir
@@ -1207,6 +1212,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
    @parameterized.expand(["bf16", "fp16", "fp32"])
    @require_torch_multi_accelerator
    @run_first
    def test_inference(self, dtype):
        if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
            self.skipTest(reason="test requires bfloat16 hardware support")
@@ -1361,6 +1367,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        return output_dir
    @parameterized.expand(params, name_func=parameterized_custom_name_func)
    @run_first
    def test_clm(self, stage, dtype):
        # this test exercises model.resize_token_embeddings() which requires param gathering outside
        # of forward - it's not used by `run_translation.py`, but it is in `run_clm.py`
@@ -1397,6 +1404,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        execute_subprocess_async(cmd, env=self.get_env())
    @require_torch_fp16
    @run_first
    def test_clm_from_config_zero3_fp16(self):
        # this test exercises AutoModel.from_config(config) - to ensure zero.Init is called
--- a/tests/deepspeed/test_model_zoo.py
+++ b/tests/deepspeed/test_model_zoo.py
@@ -28,6 +28,7 @@ from transformers.testing_utils import (
    get_tests_dir,
    require_deepspeed,
    require_torch_accelerator,
    run_first,
    slow,
    torch_device,
 )
@@ -327,6 +328,7 @@ params = list(itertools.product(stages, task_cmds.keys()))
@slow
@run_first
@require_deepspeed
@require_torch_accelerator
 class TestDeepSpeedModelZoo(TestCasePlus):
--- a/tests/fsdp/test_fsdp.py
+++ b/tests/fsdp/test_fsdp.py
@@ -358,6 +358,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
            raise AssertionError("CPU offloading failed with FSDP!")
    @require_torch_multi_accelerator
    @run_first
    @slow
    @require_fsdp_v2_version
    @require_accelerate_fsdp2
@@ -405,6 +406,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
                self.assertAlmostEqual(log["learning_rate"], log1["learning_rate"], delta=1e-5)
    @require_torch_multi_accelerator
    @run_first
    @slow
    @require_fsdp
    @require_fsdp_v2_version
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -84,6 +84,7 @@ from transformers.testing_utils import (
    require_bitsandbytes,
    require_deepspeed,
    require_flash_attn,
    require_non_hpu,
    require_safetensors,
    require_torch,
    require_torch_accelerator,
@@ -92,6 +93,7 @@ from transformers.testing_utils import (
    require_torch_multi_accelerator,
    require_torch_multi_gpu,
    require_torch_sdpa,
    run_first,
    run_test_using_subprocess,
    set_config_for_less_flaky_test,
    set_model_for_less_flaky_test,
@@ -2797,6 +2799,7 @@ class ModelTesterMixin:
                    else:
                        torch.testing.assert_close(base_output[0], new_output[0], rtol=1e-5, atol=1e-5)
    @require_non_hpu
    @require_accelerate
    @mark.accelerate_tests
    @require_torch_multi_accelerator
@@ -3727,6 +3730,9 @@ class ModelTesterMixin:
                if torch_device in ["cpu", "cuda"]:
                    atol = atols[torch_device, enable_kernels, torch_dtype]
                    rtol = rtols[torch_device, enable_kernels, torch_dtype]
                elif torch_device == "hpu":
                    atol = atols["cuda", enable_kernels, torch_dtype]
                    rtol = rtols["cuda", enable_kernels, torch_dtype]
                elif torch_device == "xpu":
                    # As of PyTorch 2.5 XPU backend supports only torch.nn.attention.SDPBackend.MATH
                    # which is implemented on PyTorch level using aten operators and is
@@ -4666,6 +4672,7 @@ class ModelTesterMixin:
    # Here we need to run with a subprocess as otherwise setting back the default device to the default value ("cpu")
    # may bring unwanted consequences on other tests. See PR #37553
    @run_first
    @run_test_using_subprocess
    @require_torch_accelerator
    def test_can_load_with_global_device_set(self):
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -3062,6 +3062,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
    # the test slower.
    @require_torch_non_multi_accelerator
    @run_test_using_subprocess
    @run_first
    @slow
    def test_can_resume_training_lm(self):
        # Check if it works for a simple language modeling example
@@ -3517,7 +3518,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
                )
    @slow
    @run_first
    def test_trainer_eval_mrpc(self):
        MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -3534,7 +3534,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
            self.assertLess(result["eval_loss"], 0.2)
    @slow
    @run_first
    def test_trainer_eval_multiple(self):
        MODEL_ID = "openai-community/gpt2"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -4125,6 +4124,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
            self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)
    @slow
    @run_first
    @require_non_hpu
    @require_torch_multi_accelerator
    def test_end_to_end_example(self):
--- a/tests/trainer/test_trainer_distributed.py
+++ b/tests/trainer/test_trainer_distributed.py
@@ -22,6 +22,7 @@ from transformers.testing_utils import (
    execute_subprocess_async,
    get_torch_dist_unique_port,
    require_torch_multi_accelerator,
    run_first,
    torch_device,
 )
 from transformers.training_args import ParallelMode
@@ -116,6 +117,7 @@ if is_torch_available():
 class TestTrainerDistributed(TestCasePlus):
    @run_first
    @require_torch_multi_accelerator
    def test_trainer(self):
        distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
@@ -199,8 +201,7 @@ if __name__ == "__main__":
    model = RegressionModel()
    training_args.per_device_train_batch_size = 1
    training_args.max_steps = 1
-    training_args.accelerator_config = {
+    training_args.accelerator_config.dispatch_batches = False
-        "dispatch_batches": False,
+
    }
    trainer = Trainer(model, training_args, train_dataset=train_dataset)
    trainer.train()
--- a/tests/trainer/test_trainer_distributed_loss.py
+++ b/tests/trainer/test_trainer_distributed_loss.py
@@ -18,11 +18,13 @@ from transformers.testing_utils import (
    execute_subprocess_async,
    get_torch_dist_unique_port,
    require_torch_multi_accelerator,
    run_first,
    torch_device,
 )
 class TestTrainerDistributedLoss(TestCasePlus):
    @run_first
    @require_torch_multi_accelerator
    def test_trainer(self):
        device_count = backend_device_count(torch_device)
--- a/tests/trainer/test_trainer_distributed_worker_seed.py
+++ b/tests/trainer/test_trainer_distributed_worker_seed.py
@@ -18,6 +18,7 @@ from transformers.testing_utils import (
    execute_subprocess_async,
    get_torch_dist_unique_port,
    require_torch_multi_accelerator,
    run_first,
    torch_device,
 )
@@ -57,6 +58,7 @@ class DummyModel(nn.Module):
 class TestTrainerDistributedWorkerSeed(TestCasePlus):
    @run_first
    @require_torch_multi_accelerator
    def test_trainer(self):
        device_count = backend_device_count(torch_device)
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -58,6 +58,7 @@ from transformers.testing_utils import (
    is_staging_test,
    require_accelerate,
    require_flax,
    require_non_hpu,
    require_read_token,
    require_safetensors,
    require_tf,
@@ -1002,6 +1003,7 @@ class ModelUtilsTest(TestCasePlus):
        self.assertIsNotNone(model)
    @require_non_hpu
    @require_accelerate
    @mark.accelerate_tests
    @require_torch_multi_accelerator
--- a/utils/print_env.py
+++ b/utils/print_env.py
@@ -21,7 +21,7 @@ import os
 import sys
 import transformers
-from transformers import is_torch_xpu_available
+from transformers import is_torch_hpu_available, is_torch_xpu_available
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
@@ -38,6 +38,9 @@ try:
        accelerator = "CUDA"
    elif is_torch_xpu_available():
        accelerator = "XPU"
    elif is_torch_hpu_available():
        accelerator = "HPU"
    print("Torch accelerator:", accelerator)
    if accelerator == "CUDA":
@@ -48,6 +51,9 @@ try:
    elif accelerator == "XPU":
        print("SYCL version:", torch.version.xpu)
        print("Number of XPUs available:", torch.xpu.device_count())
    elif accelerator == "HPU":
        print("HPU version:", torch.__version__.split("+")[-1])
        print("Number of HPUs available:", torch.hpu.device_count())
 except ImportError:
    print("Torch version:", None)