Gaudi3 CI (#38790)
This commit is contained in:
committed by
GitHub
parent
2166b6b4ff
commit
984ff89e73
121
.github/workflows/model_jobs_intel_gaudi.yml
vendored
Normal file
121
.github/workflows/model_jobs_intel_gaudi.yml
vendored
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
# Reusable workflow: runs pytest for every test folder of one slice on an
# Intel Gaudi runner group, inside the Habana PyTorch container, and uploads
# one report artifact per folder.
name: model jobs

on:
  workflow_call:
    inputs:
      # JSON list of lists of test folders; indexed with `slice_id` below.
      folder_slices:
        required: true
        type: string
      # Index of the slice of `folder_slices` this call is responsible for.
      slice_id:
        required: true
        type: number
      # Runner group name used for `runs-on.group`.
      runner:
        required: true
        type: string
      # `1gaudi` / `2gaudi`; mapped to `single-gpu` / `multi-gpu` for report names.
      machine_type:
        required: true
        type: string
      # Middle part of the report directory and artifact names.
      report_name_prefix:
        required: false
        default: run_models_gpu
        type: string

env:
  # Quoted so the values stay the literal strings `yes` / `0` / `1`
  # regardless of the YAML loader's implicit typing rules.
  RUN_SLOW: "yes"
  PT_HPU_LAZY_MODE: "0"
  TRANSFORMERS_IS_CI: "yes"
  PT_ENABLE_INT64_SUPPORT: "1"
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  HF_HOME: /mnt/cache/.cache/huggingface

jobs:
  run_models_gpu:
    name: " "
    strategy:
      max-parallel: 8
      fail-fast: false
      matrix:
        # One matrix entry per folder of the selected slice.
        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
    runs-on:
      group: ${{ inputs.runner }}
    container:
      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --runtime=habana
        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
        --env HABANA_VISIBLE_DEVICES
        --env HABANA_VISIBLE_MODULES
        --cap-add=sys_nice
        --shm-size=64G
    steps:
      - name: Echo input and matrix info
        shell: bash
        run: |
          echo "${{ inputs.folder_slices }}"
          echo "${{ matrix.folders }}"
          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"

      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        run: |
          echo "${{ matrix.folders }}"
          # Replace the first `models/` with `models_` so the value contains no
          # path separator and can be used in directory and artifact names.
          matrix_folders="${{ matrix.folders }}"
          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Install dependencies
        run: |
          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn

      - name: HL-SMI
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"

      - name: Environment
        run: python3 utils/print_env.py

      - name: Show installed libraries and their versions
        run: pip freeze

      - name: Set `machine_type` for report and artifact names
        shell: bash
        run: |
          if [ "${{ inputs.machine_type }}" = "1gaudi" ]; then
            machine_type=single-gpu
          elif [ "${{ inputs.machine_type }}" = "2gaudi" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ inputs.machine_type }}
          fi
          echo "machine_type=$machine_type" >> $GITHUB_ENV

      # The report directory uses the sanitised `env.matrix_folders`, the same
      # value the artifact name below uses, so it is a single flat directory
      # under `reports/`. The pytest target keeps the real folder path.
      - name: Run all tests on Gaudi
        run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/failures_short.txt

      # Writes a marker file into the report directory; no `if`, so it only
      # runs when all previous steps succeeded.
      - name: Run test
        shell: bash
        run: |
          mkdir -p reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
          echo "hello" > reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/hello.txt
          echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
          path: reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
|
||||||
345
.github/workflows/self-scheduled-intel-gaudi.yml
vendored
Normal file
345
.github/workflows/self-scheduled-intel-gaudi.yml
vendored
Normal file
@@ -0,0 +1,345 @@
|
|||||||
|
# Reusable workflow for the Intel Gaudi scheduled CI. `inputs.job` selects
# which one of the test jobs below runs; `send_results` always runs afterwards
# and forwards the outcome to the Slack report workflow.
name: Self-hosted runner (scheduled-intel-gaudi)

on:
  workflow_call:
    inputs:
      # Name of the job to run; compared against the job ids defined below.
      job:
        required: true
        type: string
      slack_report_channel:
        required: true
        type: string
      # Prefix of the runner group; `-<machine_type>` is appended per matrix entry.
      runner_scale_set:
        required: true
        type: string
      ci_event:
        required: true
        type: string
      report_repo_id:
        required: true
        type: string

env:
  # Quoted so the values stay literal strings regardless of the YAML loader's
  # implicit typing rules.
  NUM_SLICES: "2"
  RUN_SLOW: "yes"
  PT_HPU_LAZY_MODE: "0"
  TRANSFORMERS_IS_CI: "yes"
  PT_ENABLE_INT64_SUPPORT: "1"
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  HF_HOME: /mnt/cache/.cache/huggingface

jobs:
  # Computes the slice ids and folder slices consumed by the matrix jobs.
  setup:
    if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
    name: Setup
    runs-on: ubuntu-latest
    outputs:
      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
      # Read from the step that actually writes `quantization_matrix`.
      quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - id: set-matrix
        if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
        name: Identify models to test
        working-directory: tests
        run: |
          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
            echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
          fi

      # NOTE(review): the job-level `if` above only admits `run_models_gpu` and
      # `run_trainer_and_fsdp_gpu`, so this step's condition cannot currently
      # be true - confirm whether quantization is meant to be supported here.
      - id: set-matrix-quantization
        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
        name: Identify quantization method to test
        working-directory: tests
        run: |
          echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT

  # Fans out over machine type x slice id and delegates to the reusable
  # model-jobs workflow.
  run_models_gpu:
    if: ${{ inputs.job == 'run_models_gpu' }}
    name: " "
    needs: setup
    strategy:
      fail-fast: false
      matrix:
        machine_type: [1gaudi, 2gaudi]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs_intel_gaudi.yml
    with:
      slice_id: ${{ matrix.slice_id }}
      machine_type: ${{ matrix.machine_type }}
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
      report_name_prefix: run_models_gpu

    secrets: inherit

  # Same fan-out as `run_models_gpu`, with a different report name prefix.
  run_trainer_and_fsdp_gpu:
    if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }}
    name: " "
    needs: setup
    strategy:
      fail-fast: false
      matrix:
        machine_type: [1gaudi, 2gaudi]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs_intel_gaudi.yml
    with:
      slice_id: ${{ matrix.slice_id }}
      machine_type: ${{ matrix.machine_type }}
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
      report_name_prefix: run_trainer_and_fsdp_gpu

    secrets: inherit

  # Runs `tests/pipelines` on each machine type inside the Habana container.
  run_pipelines_gpu:
    if: ${{ inputs.job == 'run_pipelines_gpu' }}
    name: Pipelines
    strategy:
      fail-fast: false
      matrix:
        machine_type: [1gaudi, 2gaudi]
    runs-on:
      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
    container:
      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --runtime=habana
        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
        --env HABANA_VISIBLE_DEVICES
        --env HABANA_VISIBLE_MODULES
        --cap-add=sys_nice
        --shm-size=64G
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Install dependencies
        run: |
          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile

      - name: HL-SMI
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"

      - name: Environment
        run: python3 utils/print_env.py

      - name: Show installed libraries and their versions
        run: pip freeze

      - name: Set `machine_type` for report and artifact names
        shell: bash
        run: |
          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
          fi
          echo "machine_type=$machine_type" >> $GITHUB_ENV

      - name: Run all pipeline tests on Intel Gaudi
        run: |
          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
          cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
          path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports

  # Runs the `examples/pytorch` tests; the matrix only contains `1gaudi`.
  run_examples_gpu:
    if: ${{ inputs.job == 'run_examples_gpu' }}
    name: Examples directory
    strategy:
      fail-fast: false
      matrix:
        machine_type: [1gaudi]
    runs-on:
      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
    container:
      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --runtime=habana
        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
        --env HABANA_VISIBLE_DEVICES
        --env HABANA_VISIBLE_MODULES
        --cap-add=sys_nice
        --shm-size=64G
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Install dependencies
        run: |
          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile

      - name: HL-SMI
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"

      - name: Environment
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        run: |
          pip freeze

      - name: Set `machine_type` for report and artifact names
        shell: bash
        run: |
          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
          fi
          echo "machine_type=$machine_type" >> $GITHUB_ENV

      - name: Run examples tests on Intel Gaudi
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
          cat reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.machine_type }}_run_examples_gpu_test_reports
          path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports

  # Runs `tests/deepspeed` on each machine type with the HabanaAI DeepSpeed fork.
  run_deepspeed_gpu:
    if: ${{ inputs.job == 'run_deepspeed_gpu' }}
    name: Intel Gaudi deepspeed tests
    strategy:
      fail-fast: false
      matrix:
        machine_type: [1gaudi, 2gaudi]
    runs-on:
      group: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
    container:
      image: vault.habana.ai/gaudi-docker/1.21.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      # Folded scalar: the lines below are joined with single spaces.
      options: >-
        --runtime=habana
        -v /mnt/cache/.cache/huggingface:/mnt/cache/.cache/huggingface
        --env OMPI_MCA_btl_vader_single_copy_mechanism=none
        --env HABANA_VISIBLE_DEVICES
        --env HABANA_VISIBLE_MODULES
        --cap-add=sys_nice
        --shm-size=64G
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      # NOTE(review): the DeepSpeed fork is pinned to 1.20.0 while the container
      # image above is tagged 1.21.1 - confirm this pairing is intended.
      - name: Install dependencies
        run: |
          pip install -e .[testing,torch] "numpy<2.0.0" scipy scikit-learn librosa soundfile
          pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0

      - name: HL-SMI
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"

      - name: Environment
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        run: |
          pip freeze

      - name: Set `machine_type` for report and artifact names
        shell: bash
        run: |
          if [ "${{ matrix.machine_type }}" = "1gaudi" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "2gaudi" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
          fi
          echo "machine_type=$machine_type" >> $GITHUB_ENV

      - name: Run all deepspeed tests on intel Gaudi
        run: |
          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
          cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
          path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports

  # Always runs after the jobs listed in `needs` (whatever their result) and
  # hands the run metadata to the Slack report workflow.
  send_results:
    name: Slack Report
    needs:
      - setup
      - run_models_gpu
      - run_examples_gpu
      - run_pipelines_gpu
      - run_deepspeed_gpu
      - run_trainer_and_fsdp_gpu
    if: ${{ always() }}
    uses: ./.github/workflows/slack-report.yml
    with:
      job: ${{ inputs.job }}
      setup_status: ${{ needs.setup.result }}
      slack_report_channel: ${{ inputs.slack_report_channel }}
      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      report_repo_id: ${{ inputs.report_repo_id }}
      ci_event: ${{ inputs.ci_event }}

    secrets: inherit
|
||||||
67
.github/workflows/self-scheduled-intel-gaudi3-caller.yml
vendored
Normal file
67
.github/workflows/self-scheduled-intel-gaudi3-caller.yml
vendored
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
# Entry point for the Gaudi3 CI: on each trigger it calls the shared Intel
# Gaudi workflow once per test suite, passing the same runner scale set,
# Slack channel and report repository every time.
name: Self-hosted runner (Intel Gaudi3 scheduled CI caller)

on:
  schedule:
    - cron: '17 2 * * *'
  workflow_dispatch:
  repository_dispatch:

jobs:
  # Each job differs only in the `job` value handed to the called workflow.
  model-ci:
    name: Model CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    secrets: inherit
    with:
      job: run_models_gpu
      runner_scale_set: itac-bm-emr-gaudi3-dell
      ci_event: 'Scheduled CI (Intel) - Gaudi3'
      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
      slack_report_channel: '#transformers-ci-daily-intel-gaudi3'

  pipeline-ci:
    name: Pipeline CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    secrets: inherit
    with:
      job: run_pipelines_gpu
      runner_scale_set: itac-bm-emr-gaudi3-dell
      ci_event: 'Scheduled CI (Intel) - Gaudi3'
      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
      slack_report_channel: '#transformers-ci-daily-intel-gaudi3'

  example-ci:
    name: Example CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    secrets: inherit
    with:
      job: run_examples_gpu
      runner_scale_set: itac-bm-emr-gaudi3-dell
      ci_event: 'Scheduled CI (Intel) - Gaudi3'
      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
      slack_report_channel: '#transformers-ci-daily-intel-gaudi3'

  deepspeed-ci:
    name: DeepSpeed CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    secrets: inherit
    with:
      job: run_deepspeed_gpu
      runner_scale_set: itac-bm-emr-gaudi3-dell
      ci_event: 'Scheduled CI (Intel) - Gaudi3'
      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
      slack_report_channel: '#transformers-ci-daily-intel-gaudi3'

  trainer-fsdp-ci:
    name: Trainer/FSDP CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    secrets: inherit
    with:
      job: run_trainer_and_fsdp_gpu
      runner_scale_set: itac-bm-emr-gaudi3-dell
      ci_event: 'Scheduled CI (Intel) - Gaudi3'
      report_repo_id: optimum-intel/transformers_daily_ci_intel_gaudi3
      slack_report_channel: '#transformers-ci-daily-intel-gaudi3'
|
||||||
@@ -23,7 +23,7 @@ from tqdm import tqdm
|
|||||||
|
|
||||||
from ...models.bert.tokenization_bert import whitespace_tokenize
|
from ...models.bert.tokenization_bert import whitespace_tokenize
|
||||||
from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
|
from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
|
||||||
from ...utils import is_tf_available, is_torch_available, logging
|
from ...utils import is_tf_available, is_torch_available, is_torch_hpu_available, logging
|
||||||
from .utils import DataProcessor
|
from .utils import DataProcessor
|
||||||
|
|
||||||
|
|
||||||
@@ -361,11 +361,29 @@ def squad_convert_examples_to_features(
|
|||||||
is_training=not evaluate,
|
is_training=not evaluate,
|
||||||
)
|
)
|
||||||
```"""
|
```"""
|
||||||
# Defining helper methods
|
|
||||||
features = []
|
|
||||||
|
|
||||||
threads = min(threads, cpu_count())
|
if not is_torch_hpu_available():
|
||||||
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
|
threads = min(threads, cpu_count())
|
||||||
|
with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
|
||||||
|
annotate_ = partial(
|
||||||
|
squad_convert_example_to_features,
|
||||||
|
max_seq_length=max_seq_length,
|
||||||
|
doc_stride=doc_stride,
|
||||||
|
max_query_length=max_query_length,
|
||||||
|
padding_strategy=padding_strategy,
|
||||||
|
is_training=is_training,
|
||||||
|
)
|
||||||
|
features = list(
|
||||||
|
tqdm(
|
||||||
|
p.imap(annotate_, examples, chunksize=32),
|
||||||
|
total=len(examples),
|
||||||
|
desc="convert squad examples to features",
|
||||||
|
disable=not tqdm_enabled,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Non-parallel version for hpu https://github.com/huggingface/transformers/pull/38790#discussion_r2156470902
|
||||||
|
squad_convert_example_to_features_init(tokenizer_for_convert=tokenizer)
|
||||||
annotate_ = partial(
|
annotate_ = partial(
|
||||||
squad_convert_example_to_features,
|
squad_convert_example_to_features,
|
||||||
max_seq_length=max_seq_length,
|
max_seq_length=max_seq_length,
|
||||||
@@ -376,7 +394,7 @@ def squad_convert_examples_to_features(
|
|||||||
)
|
)
|
||||||
features = list(
|
features = list(
|
||||||
tqdm(
|
tqdm(
|
||||||
p.imap(annotate_, examples, chunksize=32),
|
map(annotate_, examples),
|
||||||
total=len(examples),
|
total=len(examples),
|
||||||
desc="convert squad examples to features",
|
desc="convert squad examples to features",
|
||||||
disable=not tqdm_enabled,
|
disable=not tqdm_enabled,
|
||||||
|
|||||||
@@ -3007,6 +3007,9 @@ class HfDoctestModule(Module):
|
|||||||
|
|
||||||
def _device_agnostic_dispatch(device: str, dispatch_table: dict[str, Callable], *args, **kwargs):
|
def _device_agnostic_dispatch(device: str, dispatch_table: dict[str, Callable], *args, **kwargs):
|
||||||
if device not in dispatch_table:
|
if device not in dispatch_table:
|
||||||
|
if not callable(dispatch_table["default"]):
|
||||||
|
return dispatch_table["default"]
|
||||||
|
|
||||||
return dispatch_table["default"](*args, **kwargs)
|
return dispatch_table["default"](*args, **kwargs)
|
||||||
|
|
||||||
fn = dispatch_table[device]
|
fn = dispatch_table[device]
|
||||||
|
|||||||
@@ -815,8 +815,8 @@ def is_torch_hpu_available():
|
|||||||
):
|
):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
torch_hpu_min_version = "1.5.0"
|
torch_hpu_min_accelerate_version = "1.5.0"
|
||||||
if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_version):
|
if _accelerate_available and version.parse(_accelerate_version) < version.parse(torch_hpu_min_accelerate_version):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
@@ -850,6 +850,24 @@ def is_torch_hpu_available():
|
|||||||
|
|
||||||
torch.Tensor.masked_fill_ = patched_masked_fill_
|
torch.Tensor.masked_fill_ = patched_masked_fill_
|
||||||
|
|
||||||
|
# IlyasMoutawwakil: we patch torch.compile to use the HPU backend by default
|
||||||
|
# https://github.com/huggingface/transformers/pull/38790#discussion_r2157043944
|
||||||
|
# This is necessary for cases where torch.compile is used as a decorator (defaulting to inductor)
|
||||||
|
# https://github.com/huggingface/transformers/blob/af6120b3eb2470b994c21421bb6eaa76576128b0/src/transformers/models/modernbert/modeling_modernbert.py#L204
|
||||||
|
original_compile = torch.compile
|
||||||
|
|
||||||
|
def hpu_backend_compile(*args, **kwargs):
|
||||||
|
if kwargs.get("backend", None) not in ["hpu_backend", "eager"]:
|
||||||
|
logger.warning(
|
||||||
|
f"Calling torch.compile with backend={kwargs.get('backend', None)} on a Gaudi device is not supported. "
|
||||||
|
"We will override the backend with 'hpu_backend' to avoid errors."
|
||||||
|
)
|
||||||
|
kwargs["backend"] = "hpu_backend"
|
||||||
|
|
||||||
|
return original_compile(*args, **kwargs)
|
||||||
|
|
||||||
|
torch.compile = hpu_backend_compile
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1134,10 +1134,12 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
|
|
||||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||||
@require_torch_multi_accelerator
|
@require_torch_multi_accelerator
|
||||||
|
@run_first
|
||||||
def test_basic_distributed(self, stage, dtype):
|
def test_basic_distributed(self, stage, dtype):
|
||||||
self.run_and_check(stage=stage, dtype=dtype, distributed=True)
|
self.run_and_check(stage=stage, dtype=dtype, distributed=True)
|
||||||
|
|
||||||
@require_torch_fp16
|
@require_torch_fp16
|
||||||
|
@run_first
|
||||||
def test_do_eval_no_train(self):
|
def test_do_eval_no_train(self):
|
||||||
# testing only zero3 since zero2 makes no sense with inference
|
# testing only zero3 since zero2 makes no sense with inference
|
||||||
self.run_and_check(
|
self.run_and_check(
|
||||||
@@ -1150,6 +1152,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
)
|
)
|
||||||
|
|
||||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||||
|
@run_first
|
||||||
def test_fp32_non_distributed(self, stage, dtype):
|
def test_fp32_non_distributed(self, stage, dtype):
|
||||||
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
|
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
|
||||||
# therefore no quality checks, just basic completion checks are done
|
# therefore no quality checks, just basic completion checks are done
|
||||||
@@ -1166,6 +1169,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
|
|
||||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||||
@require_torch_multi_accelerator
|
@require_torch_multi_accelerator
|
||||||
|
@run_first
|
||||||
def test_fp32_distributed(self, stage, dtype):
|
def test_fp32_distributed(self, stage, dtype):
|
||||||
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
|
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
|
||||||
# therefore no quality checks, just basic completion checks are done
|
# therefore no quality checks, just basic completion checks are done
|
||||||
@@ -1181,6 +1185,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
)
|
)
|
||||||
|
|
||||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||||
|
@run_first
|
||||||
def test_resume_train_not_from_ds_checkpoint(self, stage, dtype):
|
def test_resume_train_not_from_ds_checkpoint(self, stage, dtype):
|
||||||
# do normal training and then resume not from the deepspeed checkpoint but explicitly from
|
# do normal training and then resume not from the deepspeed checkpoint but explicitly from
|
||||||
# the saved model dir
|
# the saved model dir
|
||||||
@@ -1207,6 +1212,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
|
|
||||||
@parameterized.expand(["bf16", "fp16", "fp32"])
|
@parameterized.expand(["bf16", "fp16", "fp32"])
|
||||||
@require_torch_multi_accelerator
|
@require_torch_multi_accelerator
|
||||||
|
@run_first
|
||||||
def test_inference(self, dtype):
|
def test_inference(self, dtype):
|
||||||
if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
|
if dtype == "bf16" and not is_torch_bf16_available_on_device(torch_device):
|
||||||
self.skipTest(reason="test requires bfloat16 hardware support")
|
self.skipTest(reason="test requires bfloat16 hardware support")
|
||||||
@@ -1361,6 +1367,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
return output_dir
|
return output_dir
|
||||||
|
|
||||||
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
@parameterized.expand(params, name_func=parameterized_custom_name_func)
|
||||||
|
@run_first
|
||||||
def test_clm(self, stage, dtype):
|
def test_clm(self, stage, dtype):
|
||||||
# this test exercises model.resize_token_embeddings() which requires param gathering outside
|
# this test exercises model.resize_token_embeddings() which requires param gathering outside
|
||||||
# of forward - it's not used by `run_translation.py`, but it is in `run_clm.py`
|
# of forward - it's not used by `run_translation.py`, but it is in `run_clm.py`
|
||||||
@@ -1397,6 +1404,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
execute_subprocess_async(cmd, env=self.get_env())
|
execute_subprocess_async(cmd, env=self.get_env())
|
||||||
|
|
||||||
@require_torch_fp16
|
@require_torch_fp16
|
||||||
|
@run_first
|
||||||
def test_clm_from_config_zero3_fp16(self):
|
def test_clm_from_config_zero3_fp16(self):
|
||||||
# this test exercises AutoModel.from_config(config) - to ensure zero.Init is called
|
# this test exercises AutoModel.from_config(config) - to ensure zero.Init is called
|
||||||
|
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ from transformers.testing_utils import (
|
|||||||
get_tests_dir,
|
get_tests_dir,
|
||||||
require_deepspeed,
|
require_deepspeed,
|
||||||
require_torch_accelerator,
|
require_torch_accelerator,
|
||||||
|
run_first,
|
||||||
slow,
|
slow,
|
||||||
torch_device,
|
torch_device,
|
||||||
)
|
)
|
||||||
@@ -327,6 +328,7 @@ params = list(itertools.product(stages, task_cmds.keys()))
|
|||||||
|
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
|
@run_first
|
||||||
@require_deepspeed
|
@require_deepspeed
|
||||||
@require_torch_accelerator
|
@require_torch_accelerator
|
||||||
class TestDeepSpeedModelZoo(TestCasePlus):
|
class TestDeepSpeedModelZoo(TestCasePlus):
|
||||||
|
|||||||
@@ -358,6 +358,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
raise AssertionError("CPU offloading failed with FSDP!")
|
raise AssertionError("CPU offloading failed with FSDP!")
|
||||||
|
|
||||||
@require_torch_multi_accelerator
|
@require_torch_multi_accelerator
|
||||||
|
@run_first
|
||||||
@slow
|
@slow
|
||||||
@require_fsdp_v2_version
|
@require_fsdp_v2_version
|
||||||
@require_accelerate_fsdp2
|
@require_accelerate_fsdp2
|
||||||
@@ -405,6 +406,7 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
self.assertAlmostEqual(log["learning_rate"], log1["learning_rate"], delta=1e-5)
|
self.assertAlmostEqual(log["learning_rate"], log1["learning_rate"], delta=1e-5)
|
||||||
|
|
||||||
@require_torch_multi_accelerator
|
@require_torch_multi_accelerator
|
||||||
|
@run_first
|
||||||
@slow
|
@slow
|
||||||
@require_fsdp
|
@require_fsdp
|
||||||
@require_fsdp_v2_version
|
@require_fsdp_v2_version
|
||||||
|
|||||||
@@ -84,6 +84,7 @@ from transformers.testing_utils import (
|
|||||||
require_bitsandbytes,
|
require_bitsandbytes,
|
||||||
require_deepspeed,
|
require_deepspeed,
|
||||||
require_flash_attn,
|
require_flash_attn,
|
||||||
|
require_non_hpu,
|
||||||
require_safetensors,
|
require_safetensors,
|
||||||
require_torch,
|
require_torch,
|
||||||
require_torch_accelerator,
|
require_torch_accelerator,
|
||||||
@@ -92,6 +93,7 @@ from transformers.testing_utils import (
|
|||||||
require_torch_multi_accelerator,
|
require_torch_multi_accelerator,
|
||||||
require_torch_multi_gpu,
|
require_torch_multi_gpu,
|
||||||
require_torch_sdpa,
|
require_torch_sdpa,
|
||||||
|
run_first,
|
||||||
run_test_using_subprocess,
|
run_test_using_subprocess,
|
||||||
set_config_for_less_flaky_test,
|
set_config_for_less_flaky_test,
|
||||||
set_model_for_less_flaky_test,
|
set_model_for_less_flaky_test,
|
||||||
@@ -2797,6 +2799,7 @@ class ModelTesterMixin:
|
|||||||
else:
|
else:
|
||||||
torch.testing.assert_close(base_output[0], new_output[0], rtol=1e-5, atol=1e-5)
|
torch.testing.assert_close(base_output[0], new_output[0], rtol=1e-5, atol=1e-5)
|
||||||
|
|
||||||
|
@require_non_hpu
|
||||||
@require_accelerate
|
@require_accelerate
|
||||||
@mark.accelerate_tests
|
@mark.accelerate_tests
|
||||||
@require_torch_multi_accelerator
|
@require_torch_multi_accelerator
|
||||||
@@ -3727,6 +3730,9 @@ class ModelTesterMixin:
|
|||||||
if torch_device in ["cpu", "cuda"]:
|
if torch_device in ["cpu", "cuda"]:
|
||||||
atol = atols[torch_device, enable_kernels, torch_dtype]
|
atol = atols[torch_device, enable_kernels, torch_dtype]
|
||||||
rtol = rtols[torch_device, enable_kernels, torch_dtype]
|
rtol = rtols[torch_device, enable_kernels, torch_dtype]
|
||||||
|
elif torch_device == "hpu":
|
||||||
|
atol = atols["cuda", enable_kernels, torch_dtype]
|
||||||
|
rtol = rtols["cuda", enable_kernels, torch_dtype]
|
||||||
elif torch_device == "xpu":
|
elif torch_device == "xpu":
|
||||||
# As of PyTorch 2.5 XPU backend supports only torch.nn.attention.SDPBackend.MATH
|
# As of PyTorch 2.5 XPU backend supports only torch.nn.attention.SDPBackend.MATH
|
||||||
# which is implemented on PyTorch level using aten operators and is
|
# which is implemented on PyTorch level using aten operators and is
|
||||||
@@ -4666,6 +4672,7 @@ class ModelTesterMixin:
|
|||||||
|
|
||||||
# Here we need to run with a subprocess as otherwise setting back the default device to the default value ("cpu")
|
# Here we need to run with a subprocess as otherwise setting back the default device to the default value ("cpu")
|
||||||
# may bring unwanted consequences on other tests. See PR #37553
|
# may bring unwanted consequences on other tests. See PR #37553
|
||||||
|
@run_first
|
||||||
@run_test_using_subprocess
|
@run_test_using_subprocess
|
||||||
@require_torch_accelerator
|
@require_torch_accelerator
|
||||||
def test_can_load_with_global_device_set(self):
|
def test_can_load_with_global_device_set(self):
|
||||||
|
|||||||
@@ -3062,6 +3062,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
# the test slower.
|
# the test slower.
|
||||||
@require_torch_non_multi_accelerator
|
@require_torch_non_multi_accelerator
|
||||||
@run_test_using_subprocess
|
@run_test_using_subprocess
|
||||||
|
@run_first
|
||||||
@slow
|
@slow
|
||||||
def test_can_resume_training_lm(self):
|
def test_can_resume_training_lm(self):
|
||||||
# Check if it works for a simple language modeling example
|
# Check if it works for a simple language modeling example
|
||||||
@@ -3517,7 +3518,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
)
|
)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
@run_first
|
|
||||||
def test_trainer_eval_mrpc(self):
|
def test_trainer_eval_mrpc(self):
|
||||||
MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
|
MODEL_ID = "google-bert/bert-base-cased-finetuned-mrpc"
|
||||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||||
@@ -3534,7 +3534,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
self.assertLess(result["eval_loss"], 0.2)
|
self.assertLess(result["eval_loss"], 0.2)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
@run_first
|
|
||||||
def test_trainer_eval_multiple(self):
|
def test_trainer_eval_multiple(self):
|
||||||
MODEL_ID = "openai-community/gpt2"
|
MODEL_ID = "openai-community/gpt2"
|
||||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
||||||
@@ -4125,6 +4124,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)
|
self.assertListEqual(trainer.optimizer.param_groups[1]["params"], no_wd_params)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
|
@run_first
|
||||||
@require_non_hpu
|
@require_non_hpu
|
||||||
@require_torch_multi_accelerator
|
@require_torch_multi_accelerator
|
||||||
def test_end_to_end_example(self):
|
def test_end_to_end_example(self):
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ from transformers.testing_utils import (
|
|||||||
execute_subprocess_async,
|
execute_subprocess_async,
|
||||||
get_torch_dist_unique_port,
|
get_torch_dist_unique_port,
|
||||||
require_torch_multi_accelerator,
|
require_torch_multi_accelerator,
|
||||||
|
run_first,
|
||||||
torch_device,
|
torch_device,
|
||||||
)
|
)
|
||||||
from transformers.training_args import ParallelMode
|
from transformers.training_args import ParallelMode
|
||||||
@@ -116,6 +117,7 @@ if is_torch_available():
|
|||||||
|
|
||||||
|
|
||||||
class TestTrainerDistributed(TestCasePlus):
|
class TestTrainerDistributed(TestCasePlus):
|
||||||
|
@run_first
|
||||||
@require_torch_multi_accelerator
|
@require_torch_multi_accelerator
|
||||||
def test_trainer(self):
|
def test_trainer(self):
|
||||||
distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
|
distributed_args = f"""--nproc_per_node={backend_device_count(torch_device)}
|
||||||
@@ -199,8 +201,7 @@ if __name__ == "__main__":
|
|||||||
model = RegressionModel()
|
model = RegressionModel()
|
||||||
training_args.per_device_train_batch_size = 1
|
training_args.per_device_train_batch_size = 1
|
||||||
training_args.max_steps = 1
|
training_args.max_steps = 1
|
||||||
training_args.accelerator_config = {
|
training_args.accelerator_config.dispatch_batches = False
|
||||||
"dispatch_batches": False,
|
|
||||||
}
|
|
||||||
trainer = Trainer(model, training_args, train_dataset=train_dataset)
|
trainer = Trainer(model, training_args, train_dataset=train_dataset)
|
||||||
trainer.train()
|
trainer.train()
|
||||||
|
|||||||
@@ -18,11 +18,13 @@ from transformers.testing_utils import (
|
|||||||
execute_subprocess_async,
|
execute_subprocess_async,
|
||||||
get_torch_dist_unique_port,
|
get_torch_dist_unique_port,
|
||||||
require_torch_multi_accelerator,
|
require_torch_multi_accelerator,
|
||||||
|
run_first,
|
||||||
torch_device,
|
torch_device,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class TestTrainerDistributedLoss(TestCasePlus):
|
class TestTrainerDistributedLoss(TestCasePlus):
|
||||||
|
@run_first
|
||||||
@require_torch_multi_accelerator
|
@require_torch_multi_accelerator
|
||||||
def test_trainer(self):
|
def test_trainer(self):
|
||||||
device_count = backend_device_count(torch_device)
|
device_count = backend_device_count(torch_device)
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ from transformers.testing_utils import (
|
|||||||
execute_subprocess_async,
|
execute_subprocess_async,
|
||||||
get_torch_dist_unique_port,
|
get_torch_dist_unique_port,
|
||||||
require_torch_multi_accelerator,
|
require_torch_multi_accelerator,
|
||||||
|
run_first,
|
||||||
torch_device,
|
torch_device,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -57,6 +58,7 @@ class DummyModel(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class TestTrainerDistributedWorkerSeed(TestCasePlus):
|
class TestTrainerDistributedWorkerSeed(TestCasePlus):
|
||||||
|
@run_first
|
||||||
@require_torch_multi_accelerator
|
@require_torch_multi_accelerator
|
||||||
def test_trainer(self):
|
def test_trainer(self):
|
||||||
device_count = backend_device_count(torch_device)
|
device_count = backend_device_count(torch_device)
|
||||||
|
|||||||
@@ -58,6 +58,7 @@ from transformers.testing_utils import (
|
|||||||
is_staging_test,
|
is_staging_test,
|
||||||
require_accelerate,
|
require_accelerate,
|
||||||
require_flax,
|
require_flax,
|
||||||
|
require_non_hpu,
|
||||||
require_read_token,
|
require_read_token,
|
||||||
require_safetensors,
|
require_safetensors,
|
||||||
require_tf,
|
require_tf,
|
||||||
@@ -1002,6 +1003,7 @@ class ModelUtilsTest(TestCasePlus):
|
|||||||
|
|
||||||
self.assertIsNotNone(model)
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
|
@require_non_hpu
|
||||||
@require_accelerate
|
@require_accelerate
|
||||||
@mark.accelerate_tests
|
@mark.accelerate_tests
|
||||||
@require_torch_multi_accelerator
|
@require_torch_multi_accelerator
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
|
|
||||||
import transformers
|
import transformers
|
||||||
from transformers import is_torch_xpu_available
|
from transformers import is_torch_hpu_available, is_torch_xpu_available
|
||||||
|
|
||||||
|
|
||||||
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
|
||||||
@@ -38,6 +38,9 @@ try:
|
|||||||
accelerator = "CUDA"
|
accelerator = "CUDA"
|
||||||
elif is_torch_xpu_available():
|
elif is_torch_xpu_available():
|
||||||
accelerator = "XPU"
|
accelerator = "XPU"
|
||||||
|
elif is_torch_hpu_available():
|
||||||
|
accelerator = "HPU"
|
||||||
|
|
||||||
print("Torch accelerator:", accelerator)
|
print("Torch accelerator:", accelerator)
|
||||||
|
|
||||||
if accelerator == "CUDA":
|
if accelerator == "CUDA":
|
||||||
@@ -48,6 +51,9 @@ try:
|
|||||||
elif accelerator == "XPU":
|
elif accelerator == "XPU":
|
||||||
print("SYCL version:", torch.version.xpu)
|
print("SYCL version:", torch.version.xpu)
|
||||||
print("Number of XPUs available:", torch.xpu.device_count())
|
print("Number of XPUs available:", torch.xpu.device_count())
|
||||||
|
elif accelerator == "HPU":
|
||||||
|
print("HPU version:", torch.__version__.split("+")[-1])
|
||||||
|
print("Number of HPUs available:", torch.hpu.device_count())
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("Torch version:", None)
|
print("Torch version:", None)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user