From 984ff89e7306ad33c46f76afc9aa78d40a8c01d8 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 23 Jun 2025 10:56:51 +0200 Subject: [PATCH] Gaudi3 CI (#38790) --- .github/workflows/model_jobs_intel_gaudi.yml | 121 ++++++ .../workflows/self-scheduled-intel-gaudi.yml | 345 ++++++++++++++++++ .../self-scheduled-intel-gaudi3-caller.yml | 67 ++++ src/transformers/data/processors/squad.py | 30 +- src/transformers/testing_utils.py | 3 + src/transformers/utils/import_utils.py | 22 +- tests/deepspeed/test_deepspeed.py | 8 + tests/deepspeed/test_model_zoo.py | 2 + tests/fsdp/test_fsdp.py | 2 + tests/test_modeling_common.py | 7 + tests/trainer/test_trainer.py | 4 +- tests/trainer/test_trainer_distributed.py | 7 +- .../trainer/test_trainer_distributed_loss.py | 2 + .../test_trainer_distributed_worker_seed.py | 2 + tests/utils/test_modeling_utils.py | 2 + utils/print_env.py | 8 +- 16 files changed, 618 insertions(+), 14 deletions(-) create mode 100644 .github/workflows/model_jobs_intel_gaudi.yml create mode 100644 .github/workflows/self-scheduled-intel-gaudi.yml create mode 100644 .github/workflows/self-scheduled-intel-gaudi3-caller.yml diff --git a/.github/workflows/model_jobs_intel_gaudi.yml b/.github/workflows/model_jobs_intel_gaudi.yml new file mode 100644 index 0000000000..73ff2ba269 --- /dev/null +++ b/.github/workflows/model_jobs_intel_gaudi.yml @@ -0,0 +1,121 @@ +name: model jobs + +on: + workflow_call: + inputs: + folder_slices: + required: true + type: string + slice_id: + required: true + type: number + runner: + required: true + type: string + machine_type: + required: true + type: string + report_name_prefix: + required: false + default: run_models_gpu + type: string + +env: + RUN_SLOW: yes + PT_HPU_LAZY_MODE: 0 + TRANSFORMERS_IS_CI: yes + PT_ENABLE_INT64_SUPPORT: 1 + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + HF_HOME: /mnt/cache/.cache/huggingface + +jobs: + run_models_gpu: + name: " " + strategy: + max-parallel: 8 + fail-fast: false + matrix: + folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }} + runs-on: + group: ${{ inputs.runner }} + container: + image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest + options: --runtime=habana + -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface + --env OMPI_MCA_btl_vader_single_copy_mechanism=none + --env HABANA_VISIBLE_DEVICES + --env HABANA_VISIBLE_MODULES + --cap-add=sys_nice + --shm-size=64G + steps: + - name: Echo input and matrix info + shell: bash + run: | + echo "${{ inputs.folder_slices }}" + echo "${{ matrix.folders }}" + echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}" + + - name: Echo folder ${{ matrix.folders }} + shell: bash + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install dependencies + run: | + pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn + + - name: HL-SMI + run: | + hl-smi + echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}" + echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}" + + - name: Environment + run: python3 utils/print_env.py + + - name: Show installed libraries and their versions + run: pip freeze + + - name: Set `machine_type` for report and artifact names + shell: bash + run: | + if [ "${{ inputs.machine_type }}" = "1gaudi" ]; then + machine_type=single-gpu + elif [ "${{ inputs.machine_type }}" = "2gaudi" ]; then + machine_type=multi-gpu + else + machine_type=${{ inputs.machine_type }} + fi + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Run all tests on Gaudi + run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt + + - name: Run test + shell: bash + run: | + mkdir -p reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports + echo "hello" > reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt + echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports" + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports + path: reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports diff --git a/.github/workflows/self-scheduled-intel-gaudi.yml b/.github/workflows/self-scheduled-intel-gaudi.yml new file mode 100644 index 0000000000..2db5ece064 --- /dev/null +++ b/.github/workflows/self-scheduled-intel-gaudi.yml @@ -0,0 +1,345 @@ +name: Self-hosted runner (scheduled-intel-gaudi) + +on: + workflow_call: + inputs: + job: + required: true + type: string + slack_report_channel: + required: true + type: string + runner_scale_set: + required: true + type: string + ci_event: + required: true + type: string + report_repo_id: + required: true + type: string + +env: + NUM_SLICES: 2 + RUN_SLOW: yes + PT_HPU_LAZY_MODE: 0 + TRANSFORMERS_IS_CI: yes + PT_ENABLE_INT64_SUPPORT: 1 + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + HF_HOME: /mnt/cache/.cache/huggingface + +jobs: + setup: + if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job) + name: Setup + runs-on: ubuntu-latest + outputs: + slice_ids: ${{ steps.set-matrix.outputs.slice_ids }} + folder_slices: ${{ steps.set-matrix.outputs.folder_slices }} + quantization_matrix: ${{ steps.set-matrix.outputs.quantization_matrix }} + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - id: set-matrix + if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job) + name: Identify models to test + working-directory: tests + run: | + if [ "${{ inputs.job }}" = "run_models_gpu" ]; then + echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT + echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT + elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then + echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT + echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT + fi + + - id: set-matrix-quantization + if: ${{ inputs.job == 'run_quantization_torch_gpu' }} + name: Identify quantization method to test + working-directory: tests + run: | + echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT + + run_models_gpu: + if: ${{ inputs.job == 'run_models_gpu' }} + name: " " + needs: setup + strategy: + fail-fast: false + matrix: + machine_type: [1gaudi, 2gaudi] + slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }} + uses: ./.github/workflows/model_jobs_intel_gaudi.yml + with: + slice_id: ${{ matrix.slice_id }} + machine_type: ${{ matrix.machine_type }} + folder_slices: ${{ needs.setup.outputs.folder_slices }} + runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }} + report_name_prefix: run_models_gpu + + secrets: inherit + + run_trainer_and_fsdp_gpu: + if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }} + name: " " + needs: setup + strategy: + fail-fast: false + matrix: + machine_type: [1gaudi, 2gaudi] + slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }} + uses: ./.github/workflows/model_jobs_intel_gaudi.yml + with: + slice_id: ${{ matrix.slice_id }} + machine_type: ${{ matrix.machine_type }} + folder_slices: ${{ needs.setup.outputs.folder_slices }} + runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }} + report_name_prefix: run_trainer_and_fsdp_gpu + + secrets: inherit + + run_pipelines_gpu: + if: ${{ inputs.job == 'run_pipelines_gpu' }} + name: Pipelines + strategy: + fail-fast: false + matrix: + machine_type: [1gaudi, 2gaudi] + runs-on: + group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }} + container: + image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest + options: --runtime=habana + -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface + --env OMPI_MCA_btl_vader_single_copy_mechanism=none + --env HABANA_VISIBLE_DEVICES + --env HABANA_VISIBLE_MODULES + --cap-add=sys_nice + --shm-size=64G + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install dependencies + run: | + pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile + + - name: HL-SMI + run: | + hl-smi + echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}" + echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}" + + - name: Environment + run: python3 utils/print_env.py + + - name: Show installed libraries and their versions + run: pip freeze + + - name: Set `machine_type` for report and artifact names + shell: bash + run: | + if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Run all pipeline tests on Intel Gaudi + run: | + python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test" + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: | + cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports + path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports + + run_examples_gpu: + if: ${{ inputs.job == 'run_examples_gpu' }} + name: Examples directory + strategy: + fail-fast: false + matrix: + machine_type: [1gaudi] + runs-on: + group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }} + container: + image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest + options: --runtime=habana + -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface + --env OMPI_MCA_btl_vader_single_copy_mechanism=none + --env HABANA_VISIBLE_DEVICES + --env HABANA_VISIBLE_MODULES + --cap-add=sys_nice + --shm-size=64G + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install dependencies + run: | + pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile + + - name: HL-SMI + run: | + hl-smi + echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}" + echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}" + + - name: Environment + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + run: | + pip freeze + + - name: Set `machine_type` for report and artifact names + shell: bash + run: | + if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Run examples tests on Intel Gaudi + run: | + pip install -r examples/pytorch/_tests_requirements.txt + python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test" + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: | + cat reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_examples_gpu_test_reports + path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports + + run_deepspeed_gpu: + if: ${{ inputs.job == 'run_deepspeed_gpu' }} + name: Intel Gaudi deepspeed tests + strategy: + fail-fast: false + matrix: + machine_type: [1gaudi, 2gaudi] + runs-on: + group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }} + container: + image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest + options: --runtime=habana + -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface + --env OMPI_MCA_btl_vader_single_copy_mechanism=none + --env HABANA_VISIBLE_DEVICES + --env HABANA_VISIBLE_MODULES + --cap-add=sys_nice + --shm-size=64G + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install dependencies + run: | + pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile + pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 + + - name: HL-SMI + run: | + hl-smi + echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}" + echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}" + + - name: Environment + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + run: | + pip freeze + + - name: Set `machine_type` for report and artifact names + shell: bash + run: | + if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Run all deepspeed tests on intel Gaudi + run: | + python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test" + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: | + cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports + path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports + + send_results: + name: Slack Report + needs: + [ + setup, + run_models_gpu, + run_examples_gpu, + run_pipelines_gpu, + run_deepspeed_gpu, + run_trainer_and_fsdp_gpu, + ] + if: ${{ always() }} + uses: ./.github/workflows/slack-report.yml + with: + job: ${{ inputs.job }} + setup_status: ${{ needs.setup.result }} + slack_report_channel: ${{ inputs.slack_report_channel }} + quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }} + folder_slices: ${{ needs.setup.outputs.folder_slices }} + report_repo_id: ${{ inputs.report_repo_id }} + ci_event: ${{ inputs.ci_event }} + + secrets: inherit diff --git a/.github/workflows/self-scheduled-intel-gaudi3-caller.yml b/.github/workflows/self-scheduled-intel-gaudi3-caller.yml new file mode 100644 index 0000000000..83cb89290d --- /dev/null +++ b/.github/workflows/self-scheduled-intel-gaudi3-caller.yml @@ -0,0 +1,67 @@ +name: Self-hosted runner (Intel Gaudi3 scheduled CI caller) + +on: + repository_dispatch: + workflow_dispatch: + schedule: + - cron: "17 2 * * *" + +jobs: + model-ci: + name: Model CI + uses: ./.github/workflows/self-scheduled-intel-gaudi.yml + with: + job: run_models_gpu + ci_event: Scheduled CI (Intel) - Gaudi3 + runner_scale_set: itac-bm-emr-gaudi3-dell + slack_report_channel: "#transformers-ci-daily-intel-gaudi3" + report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3 + + secrets: inherit + + pipeline-ci: + name: Pipeline CI + uses: ./.github/workflows/self-scheduled-intel-gaudi.yml + with: + job: run_pipelines_gpu + ci_event: Scheduled CI (Intel) - Gaudi3 + runner_scale_set: itac-bm-emr-gaudi3-dell + slack_report_channel: "#transformers-ci-daily-intel-gaudi3" + report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3 + + secrets: inherit + + example-ci: + name: Example CI + uses: ./.github/workflows/self-scheduled-intel-gaudi.yml + with: + job: run_examples_gpu + ci_event: Scheduled CI (Intel) - Gaudi3 + runner_scale_set: itac-bm-emr-gaudi3-dell + slack_report_channel: "#transformers-ci-daily-intel-gaudi3" + report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3 + + secrets: inherit + + deepspeed-ci: + name: DeepSpeed CI + uses: ./.github/workflows/self-scheduled-intel-gaudi.yml + with: + job: run_deepspeed_gpu + ci_event: Scheduled CI (Intel) - Gaudi3 + runner_scale_set: itac-bm-emr-gaudi3-dell + slack_report_channel: "#transformers-ci-daily-intel-gaudi3" + report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3 + + secrets: inherit + + trainer-fsdp-ci: + name: Trainer/FSDP CI + uses: ./.github/workflows/self-scheduled-intel-gaudi.yml + with: + job: run_trainer_and_fsdp_gpu + ci_event: Scheduled CI (Intel) - Gaudi3 + runner_scale_set: itac-bm-emr-gaudi3-dell + slack_report_channel: "#transformers-ci-daily-intel-gaudi3" + report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3 + secrets: inherit diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py index 5f3cd0fd28..4a1b44146c 100644 --- a/src/transformers/data/processors/squad.py +++ b/src/transformers/data/processors/squad.py @@ -23,7 +23,7 @@ from tqdm import tqdm from ...models.bert.tokenization_bert import whitespace_tokenize from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy -from ...utils import is_tf_available, is_torch_available, logging +from ...utils import is_tf_available, is_torch_available, is_torch_hpu_available, logging from .utils import DataProcessor @@ -361,11 +361,29 @@ def squad_convert_examples_to_features( is_training=not evaluate, ) ```""" - # Defining helper methods - features = [] - threads = min(threads, cpu_count()) - with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: + if not is_torch_hpu_available(): + threads = min(threads, cpu_count()) + with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: + annotate_ = partial( + squad_convert_example_to_features, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + padding_strategy=padding_strategy, + is_training=is_training, + ) + features = list( + tqdm( + p.imap(annotate_, examples, chunksize=32), + total=len(examples), + desc="convert squad examples to features", + disable=not tqdm_enabled, + ) + ) + else: + # Non-parallel version for hpu https://github.com/huggingface/transformers/pull/38790#discussion_r2156470902 + squad_convert_example_to_features_init(tokenizer_for_convert=tokenizer) annotate_ = partial( squad_convert_example_to_features, max_seq_length=max_seq_length, @@ -376,7 +394,7 @@ def squad_convert_examples_to_features( ) features = list( tqdm( - p.imap(annotate_, examples, chunksize=32), + map(annotate_, examples), total=len(examples), desc="convert squad examples to features", disable=not tqdm_enabled, diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 8f7aedb625..7c3c6ccac8 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -3007,6 +3007,9 @@ class HfDoctestModule(Module): def _device_agnostic_dispatch(device: str, dispatch_table: dict[str, Callable], *args, **kwargs): if device not in dispatch_table: + if not callable(dispatch_table["default"]): + return dispatch_table["default"] + return dispatch_table["default"](*args, **kwargs) fn = dispatch_table[device] diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 6ae1b36d0b..a933c9638d 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -815,8 +815,8 @@ def is_torch_hpu_available(): ): return False - torch_hpu_min_version = "1.5.0" - if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_version): + torch_hpu_min_accelerate_version = "1.5.0" + if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_accelerate_version): return False import torch @@ -850,6 +850,24 @@ def is_torch_hpu_available(): torch.Tensor.masked_fill_ = patched_masked_fill_ + # IlyasMoutawwakil: we patch torch.compile to use the HPU backend by default + # https://github.com/huggingface/transformers/pull/38790#discussion_r2157043944 + # This is necessary for cases where torch.compile is used as a decorator (defaulting to inductor) + # https://github.com/huggingface/transformers/blob/af6120b3eb2470b994c21421bb6eaa76576128b0/src/transformers/models/modernbert/modeling_modernbert.py#L204 + original_compile = torch.compile + + def hpu_backend_compile(*args, **kwargs): + if kwargs.get("backend", None) not in ["hpu_backend", "eager"]: + logger.warning( + f"Calling torch.compile with backend={kwargs.get('backend', None)} on a Gaudi device is not supported. " + "We will override the backend with 'hpu_backend' to avoid errors." + ) + kwargs["backend"] = "hpu_backend" + + return original_compile(*args, **kwargs) + + torch.compile = hpu_backend_compile + return True diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index f07756f731..1a4966b09e 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -1134,10 +1134,12 @@ class TestDeepSpeedWithLauncher(TestCasePlus): @parameterized.expand(params, name_func=parameterized_custom_name_func) @require_torch_multi_accelerator + @run_first def test_basic_distributed(self, stage, dtype): self.run_and_check(stage=stage, dtype=dtype, distributed=True) @require_torch_fp16 + @run_first def test_do_eval_no_train(self): # testing only zero3 since zero2 makes no sense with inference self.run_and_check( @@ -1150,6 +1152,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): ) @parameterized.expand(params, name_func=parameterized_custom_name_func) + @run_first def test_fp32_non_distributed(self, stage, dtype): # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - # therefore no quality checks, just basic completion checks are done @@ -1166,6 +1169,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): @parameterized.expand(params, name_func=parameterized_custom_name_func) @require_torch_multi_accelerator + @run_first def test_fp32_distributed(self, stage, dtype): # real model needs too much GPU memory under stage2+fp32, so using tiny random model here - # therefore no quality checks, just basic completion checks are done @@ -1181,6 +1185,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): ) @parameterized.expand(params, name_func=parameterized_custom_name_func) + @run_first def test_resume_train_not_from_ds_checkpoint(self, stage, dtype): # do normal training and then resume not from the deepspeed checkpoint but explicitly from # the saved model dir @@ -1207,6 +1212,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): @parameterized.expand(["bf16", "fp16", "fp32"]) @require_torch_multi_accelerator + @run_first def test_inference(self, dtype): if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device): self.skipTest(reason="test requires bfloat16 hardware support") @@ -1361,6 +1367,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): return output_dir @parameterized.expand(params, name_func=parameterized_custom_name_func) + @run_first def test_clm(self, stage, dtype): # this test exercises model.resize_token_embeddings() which requires param gathering outside # of forward - it's not used by `run_translation.py`, but it is in `run_clm.py` @@ -1397,6 +1404,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus): execute_subprocess_async(cmd, env=self.get_env()) @require_torch_fp16 + @run_first def test_clm_from_config_zero3_fp16(self): # this test exercises AutoModel.from_config(config) - to ensure zero.Init is called diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py index dba3e18abb..b2c277b862 100644 --- a/tests/deepspeed/test_model_zoo.py +++ b/tests/deepspeed/test_model_zoo.py @@ -28,6 +28,7 @@ from transformers.testing_utils import ( get_tests_dir, require_deepspeed, require_torch_accelerator, + run_first, slow, torch_device, ) @@ -327,6 +328,7 @@ params = list(itertools.product(stages, task_cmds.keys())) @slow +@run_first @require_deepspeed @require_torch_accelerator class TestDeepSpeedModelZoo(TestCasePlus): diff --git a/tests/fsdp/test_fsdp.py b/tests/fsdp/test_fsdp.py index 781199747f..a932a1fbac 100644 --- a/tests/fsdp/test_fsdp.py +++ b/tests/fsdp/test_fsdp.py @@ -358,6 +358,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon): raise AssertionError("CPU offloading failed with FSDP!") @require_torch_multi_accelerator + @run_first @slow @require_fsdp_v2_version @require_accelerate_fsdp2 @@ -405,6 +406,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon): self.assertAlmostEqual(log["learning_rate"], log1["learning_rate"], delta=1e-5) @require_torch_multi_accelerator + @run_first @slow @require_fsdp @require_fsdp_v2_version diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 4e2555b57e..874323b1e9 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -84,6 +84,7 @@ from transformers.testing_utils import ( require_bitsandbytes, require_deepspeed, require_flash_attn, + require_non_hpu, require_safetensors, require_torch, require_torch_accelerator, @@ -92,6 +93,7 @@ from transformers.testing_utils import ( require_torch_multi_accelerator, require_torch_multi_gpu, require_torch_sdpa, + run_first, run_test_using_subprocess, set_config_for_less_flaky_test, set_model_for_less_flaky_test, @@ -2797,6 +2799,7 @@ class ModelTesterMixin: else: torch.testing.assert_close(base_output[0], new_output[0], rtol=1e-5, atol=1e-5) + @require_non_hpu @require_accelerate @mark.accelerate_tests @require_torch_multi_accelerator @@ -3727,6 +3730,9 @@ class ModelTesterMixin: if torch_device in ["cpu", "cuda"]: atol = atols[torch_device, enable_kernels, torch_dtype] rtol = rtols[torch_device, enable_kernels, torch_dtype] + elif torch_device == "hpu": + atol = atols["cuda", enable_kernels, torch_dtype] + rtol = rtols["cuda", enable_kernels, torch_dtype] elif torch_device == "xpu": # As of PyTorch 2.5 XPU backend supports only torch.nn.attention.SDPBackend.MATH # which is implemented on PyTorch level using aten operators and is @@ -4666,6 +4672,7 @@ class ModelTesterMixin: # Here we need to run with a subprocess as otherwise setting back the default device to the default value ("cpu") # may bring unwanted consequences on other tests. See PR #37553 + @run_first @run_test_using_subprocess @require_torch_accelerator def test_can_load_with_global_device_set(self): diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 2594edcdef..878940b937 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -3062,6 +3062,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): # the test slower. @require_torch_non_multi_accelerator @run_test_using_subprocess + @run_first @slow def test_can_resume_training_lm(self): # Check if it works for a simple language modeling example @@ -3517,7 +3518,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): ) @slow - @run_first def test_trainer_eval_mrpc(self): MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc" tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -3534,7 +3534,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): self.assertLess(result["eval_loss"], 0.2) @slow - @run_first def test_trainer_eval_multiple(self): MODEL_ID = "openai-community/gpt2" tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) @@ -4125,6 +4124,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params) @slow + @run_first @require_non_hpu @require_torch_multi_accelerator def test_end_to_end_example(self): diff --git a/tests/trainer/test_trainer_distributed.py b/tests/trainer/test_trainer_distributed.py index 17f7bee23b..54568caac3 100644 --- a/tests/trainer/test_trainer_distributed.py +++ b/tests/trainer/test_trainer_distributed.py @@ -22,6 +22,7 @@ from transformers.testing_utils import ( execute_subprocess_async, get_torch_dist_unique_port, require_torch_multi_accelerator, + run_first, torch_device, ) from transformers.training_args import ParallelMode @@ -116,6 +117,7 @@ if is_torch_available(): class TestTrainerDistributed(TestCasePlus): + @run_first @require_torch_multi_accelerator def test_trainer(self): distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)} @@ -199,8 +201,7 @@ if __name__ == "__main__": model = RegressionModel() training_args.per_device_train_batch_size = 1 training_args.max_steps = 1 - training_args.accelerator_config = { - "dispatch_batches": False, - } + training_args.accelerator_config.dispatch_batches = False + trainer = Trainer(model, training_args, train_dataset=train_dataset) trainer.train() diff --git a/tests/trainer/test_trainer_distributed_loss.py b/tests/trainer/test_trainer_distributed_loss.py index 405763125e..93cc042fe6 100644 --- a/tests/trainer/test_trainer_distributed_loss.py +++ b/tests/trainer/test_trainer_distributed_loss.py @@ -18,11 +18,13 @@ from transformers.testing_utils import ( execute_subprocess_async, get_torch_dist_unique_port, require_torch_multi_accelerator, + run_first, torch_device, ) class TestTrainerDistributedLoss(TestCasePlus): + @run_first @require_torch_multi_accelerator def test_trainer(self): device_count = backend_device_count(torch_device) diff --git a/tests/trainer/test_trainer_distributed_worker_seed.py b/tests/trainer/test_trainer_distributed_worker_seed.py index 3fa625af74..ded0382677 100644 --- a/tests/trainer/test_trainer_distributed_worker_seed.py +++ b/tests/trainer/test_trainer_distributed_worker_seed.py @@ -18,6 +18,7 @@ from transformers.testing_utils import ( execute_subprocess_async, get_torch_dist_unique_port, require_torch_multi_accelerator, + run_first, torch_device, ) @@ -57,6 +58,7 @@ class DummyModel(nn.Module): class TestTrainerDistributedWorkerSeed(TestCasePlus): + @run_first @require_torch_multi_accelerator def test_trainer(self): device_count = backend_device_count(torch_device) diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index 92a38baf94..903283dd4a 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -58,6 +58,7 @@ from transformers.testing_utils import ( is_staging_test, require_accelerate, require_flax, + require_non_hpu, require_read_token, require_safetensors, require_tf, @@ -1002,6 +1003,7 @@ class ModelUtilsTest(TestCasePlus): self.assertIsNotNone(model) + @require_non_hpu @require_accelerate @mark.accelerate_tests @require_torch_multi_accelerator diff --git a/utils/print_env.py b/utils/print_env.py index 04ea99947e..e6d54fff2c 100644 --- a/utils/print_env.py +++ b/utils/print_env.py @@ -21,7 +21,7 @@ import os import sys import transformers -from transformers import is_torch_xpu_available +from transformers import is_torch_hpu_available, is_torch_xpu_available os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" @@ -38,6 +38,9 @@ try: accelerator = "CUDA" elif is_torch_xpu_available(): accelerator = "XPU" + elif is_torch_hpu_available(): + accelerator = "HPU" + print("Torch accelerator:", accelerator) if accelerator == "CUDA": @@ -48,6 +51,9 @@ try: elif accelerator == "XPU": print("SYCL version:", torch.version.xpu) print("Number of XPUs available:", torch.xpu.device_count()) + elif accelerator == "HPU": + print("HPU version:", torch.__version__.split("+")[-1]) + print("Number of HPUs available:", torch.hpu.device_count()) except ImportError: print("Torch version:", None)