Fix many HPU failures in the CI (#39066)
* more torch.hpu patches * increase top_k because it results in flaky behavior when Temperature, TopP and TopK are used together, which ends up killing beams early. * remove temporary fix * fix scatter operation when input and src are the same * trigger * fix and reduce * skip finding batch size as it makes the hpu go loco * fix fsdp (yay all are passing) * fix checking equal nan values * style * remove models list * order * rename to cuda_extensions * Update src/transformers/trainer.py
This commit is contained in:
committed by
GitHub
parent
bff964c429
commit
18e0cae207
35
.github/workflows/self-scheduled-intel-gaudi.yml
vendored
35
.github/workflows/self-scheduled-intel-gaudi.yml
vendored
@@ -84,8 +84,6 @@ jobs:
|
||||
machine_type: ${{ matrix.machine_type }}
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
report_name_prefix: run_models_gpu
|
||||
|
||||
secrets: inherit
|
||||
|
||||
run_trainer_and_fsdp_gpu:
|
||||
@@ -104,11 +102,10 @@ jobs:
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
|
||||
report_name_prefix: run_trainer_and_fsdp_gpu
|
||||
|
||||
secrets: inherit
|
||||
|
||||
run_pipelines_gpu:
|
||||
if: ${{ inputs.job == 'run_pipelines_gpu' }}
|
||||
run_pipelines_torch_gpu:
|
||||
if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
|
||||
name: Pipelines
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -161,20 +158,20 @@ jobs:
|
||||
|
||||
- name: Run all pipeline tests on Intel Gaudi
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt
|
||||
cat reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports
|
||||
name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
|
||||
|
||||
run_examples_gpu:
|
||||
if: ${{ inputs.job == 'run_examples_gpu' }}
|
||||
@@ -248,8 +245,8 @@ jobs:
|
||||
name: ${{ env.machine_type }}_run_examples_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports
|
||||
|
||||
run_deepspeed_gpu:
|
||||
if: ${{ inputs.job == 'run_deepspeed_gpu' }}
|
||||
run_torch_cuda_extensions_gpu:
|
||||
if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
|
||||
name: Intel Gaudi deepspeed tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -305,20 +302,20 @@ jobs:
|
||||
|
||||
- name: Run all deepspeed tests on intel Gaudi
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed -m "not not_device_test"
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt
|
||||
cat reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports
|
||||
name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
|
||||
path: reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
|
||||
|
||||
send_results:
|
||||
name: Slack Report
|
||||
@@ -327,8 +324,8 @@ jobs:
|
||||
setup,
|
||||
run_models_gpu,
|
||||
run_examples_gpu,
|
||||
run_pipelines_gpu,
|
||||
run_deepspeed_gpu,
|
||||
run_torch_cuda_extensions_gpu,
|
||||
run_pipelines_torch_gpu,
|
||||
run_trainer_and_fsdp_gpu,
|
||||
]
|
||||
if: ${{ always() }}
|
||||
|
||||
@@ -23,7 +23,7 @@ jobs:
|
||||
name: Pipeline CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_pipelines_gpu
|
||||
job: run_pipelines_torch_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
@@ -47,7 +47,7 @@ jobs:
|
||||
name: DeepSpeed CI
|
||||
uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
|
||||
with:
|
||||
job: run_deepspeed_gpu
|
||||
job: run_torch_cuda_extensions_gpu
|
||||
ci_event: Scheduled CI (Intel) - Gaudi3
|
||||
runner_scale_set: itac-bm-emr-gaudi3-dell
|
||||
slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
|
||||
|
||||
Reference in New Issue
Block a user