Fix many HPU failures in the CI (#39066)

* more torch.hpu patches

* increase top_k because it results in flaky behavior when Tempreture, TopP and TopK are used together, which ends up killing beams early.

* remove temporal fix

* fix scatter operation when input and src are the same

* trigger

* fix and reduce

* skip finding batch size as it makes the hpu go loco

* fix fsdp (yay all are passing)

* fix checking equal nan values

* style

* remove models list

* order

* rename to cuda_extensions

* Update src/transformers/trainer.py
This commit is contained in:
Ilyas Moutawwakil
2025-07-03 11:17:27 +02:00
committed by GitHub
parent bff964c429
commit 18e0cae207
5 changed files with 71 additions and 54 deletions

View File

@@ -84,8 +84,6 @@ jobs:
machine_type: ${{ matrix.machine_type }}
folder_slices: ${{ needs.setup.outputs.folder_slices }}
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
report_name_prefix: run_models_gpu
secrets: inherit
run_trainer_and_fsdp_gpu:
@@ -104,11 +102,10 @@ jobs:
folder_slices: ${{ needs.setup.outputs.folder_slices }}
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
report_name_prefix: run_trainer_and_fsdp_gpu
secrets: inherit
run_pipelines_gpu:
if: ${{ inputs.job == 'run_pipelines_gpu' }}
run_pipelines_torch_gpu:
if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
name: Pipelines
strategy:
fail-fast: false
@@ -161,20 +158,20 @@ jobs:
- name: Run all pipeline tests on Intel Gaudi
run: |
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: |
cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt
cat reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports
name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
path: reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
run_examples_gpu:
if: ${{ inputs.job == 'run_examples_gpu' }}
@@ -248,8 +245,8 @@ jobs:
name: ${{ env.machine_type }}_run_examples_gpu_test_reports
path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports
run_deepspeed_gpu:
if: ${{ inputs.job == 'run_deepspeed_gpu' }}
run_torch_cuda_extensions_gpu:
if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
name: Intel Gaudi deepspeed tests
strategy:
fail-fast: false
@@ -305,20 +302,20 @@ jobs:
- name: Run all deepspeed tests on intel Gaudi
run: |
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed -m "not not_device_test"
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: |
cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt
cat reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports
name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
path: reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
send_results:
name: Slack Report
@@ -327,8 +324,8 @@ jobs:
setup,
run_models_gpu,
run_examples_gpu,
run_pipelines_gpu,
run_deepspeed_gpu,
run_torch_cuda_extensions_gpu,
run_pipelines_torch_gpu,
run_trainer_and_fsdp_gpu,
]
if: ${{ always() }}

View File

@@ -23,7 +23,7 @@ jobs:
name: Pipeline CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_pipelines_gpu
job: run_pipelines_torch_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
@@ -47,7 +47,7 @@ jobs:
name: DeepSpeed CI
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
with:
job: run_deepspeed_gpu
job: run_torch_cuda_extensions_gpu
ci_event: Scheduled CI (Intel) - Gaudi3
runner_scale_set: itac-bm-emr-gaudi3-dell
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"