Split daily CI using 2 level matrix (#28773)
* update / add new workflow files * Add comment * Use env.NUM_SLICES * use scripts * use scripts * use scripts * Fix * using one script * Fix * remove unused file * update * fail-fast: false * remove unused file * fix * fix * use matrix * inputs * style * update * fix * fix * no model name * add doc * allow args * style * pass argument --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
144
.github/workflows/self-scheduled.yml
vendored
144
.github/workflows/self-scheduled.yml
vendored
@@ -27,6 +27,7 @@ env:
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
NUM_SLICES: 2
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
@@ -39,7 +40,8 @@ jobs:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
||||
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -61,133 +63,27 @@ jobs:
|
||||
name: Identify models to test
|
||||
working-directory: /transformers/tests
|
||||
run: |
|
||||
echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
|
||||
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
run_tests_single_gpu:
|
||||
name: Model tests
|
||||
run_tests_gpu:
|
||||
name: " "
|
||||
needs: setup
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
||||
machine_type: [single-gpu]
|
||||
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
|
||||
|
||||
run_tests_multi_gpu:
|
||||
name: Model tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
||||
machine_type: [multi-gpu]
|
||||
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
||||
uses: ./.github/workflows/model_jobs.yml
|
||||
with:
|
||||
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||
machine_type: ${{ matrix.machine_type }}
|
||||
slice_id: ${{ matrix.slice_id }}
|
||||
secrets: inherit
|
||||
|
||||
run_examples_gpu:
|
||||
name: Examples directory
|
||||
@@ -407,8 +303,7 @@ jobs:
|
||||
if: always()
|
||||
needs: [
|
||||
setup,
|
||||
run_tests_single_gpu,
|
||||
run_tests_multi_gpu,
|
||||
run_tests_gpu,
|
||||
run_examples_gpu,
|
||||
run_pipelines_tf_gpu,
|
||||
run_pipelines_torch_gpu,
|
||||
@@ -455,8 +350,7 @@ jobs:
|
||||
if: always()
|
||||
needs: [
|
||||
setup,
|
||||
run_tests_single_gpu,
|
||||
run_tests_multi_gpu,
|
||||
run_tests_gpu,
|
||||
run_examples_gpu,
|
||||
run_pipelines_tf_gpu,
|
||||
run_pipelines_torch_gpu,
|
||||
@@ -490,7 +384,7 @@ jobs:
|
||||
sudo apt-get install -y curl
|
||||
pip install slack_sdk
|
||||
pip show slack_sdk
|
||||
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
|
||||
python utils/notification_service.py "${{ needs.setup.outputs.folder_slices }}"
|
||||
|
||||
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
|
||||
- name: Failure table artifacts
|
||||
|
||||
Reference in New Issue
Block a user