diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml
new file mode 100644
index 0000000000..8bf8d78570
--- /dev/null
+++ b/.github/workflows/model_jobs.yml
@@ -0,0 +1,108 @@
+# Reusable workflow (triggered via `workflow_call`): runs the test folders listed in one slice of `folder_slices`
+# on runners of the given `machine_type`. It is called by the `run_tests_gpu` job of `self-scheduled.yml`, once per
+# (`machine_type`, `slice_id`) pair.
+name: model jobs
+
+on:
+  workflow_call:
+    inputs:
+      # Nested list of test folders (one inner list per slice), as printed by `utils/split_model_tests.py`.
+      folder_slices:
+        required: true
+        type: string
+      # Runner label: the caller passes `single-gpu` or `multi-gpu`.
+      machine_type:
+        required: true
+        type: string
+      # Index of the slice in `folder_slices` whose folders form the job matrix of this run.
+      slice_id:
+        required: true
+        type: number
+
+env:
+  HF_HOME: /mnt/cache
+  TRANSFORMERS_IS_CI: "yes"
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  RUN_SLOW: "yes"
+  # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
+  # This token is created under the bot `hf-transformers-bot`.
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  TF_FORCE_GPU_ALLOW_GROWTH: true
+  RUN_PT_TF_CROSS_TESTS: 1
+  CUDA_VISIBLE_DEVICES: 0,1
+
+jobs:
+  model_job:
+    name: " "
+    strategy:
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
+    runs-on: ['${{ inputs.machine_type }}', nvidia-gpu, t4, daily-ci]
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Echo input and matrix info
+        shell: bash
+        run: |
+          echo "${{ inputs.folder_slices }}"
+          echo "${{ matrix.folders }}"
+          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
+
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders="${{ matrix.folders }}"
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> "$GITHUB_ENV"
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+      - name: Run test
+        shell: bash
+        run: |
+          mkdir -p /transformers/reports/${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}
+          echo "hello" > /transformers/reports/${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}/hello.txt
+          echo "${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}"
+
+      - name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ inputs.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index d4b84983cd..d44e9a29ec 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -27,6 +27,7 @@ env:
   TF_FORCE_GPU_ALLOW_GROWTH: true
   RUN_PT_TF_CROSS_TESTS: 1
   CUDA_VISIBLE_DEVICES: 0,1
+  NUM_SLICES: 2
 
 jobs:
   setup:
@@ -39,7 +40,8 @@ jobs:
       image: huggingface/transformers-all-latest-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
+      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
     steps:
       - name: Update clone
         working-directory: /transformers
@@ -61,133 +63,27 @@ jobs:
         name: Identify models to test
         working-directory: /transformers/tests
         run: |
-          echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
+          echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
+          echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
 
       - name: NVIDIA-SMI
         run: |
           nvidia-smi
 
-  run_tests_single_gpu:
-    name: Model tests
+  run_tests_gpu:
+    name: " "
+    needs: setup
     strategy:
       fail-fast: false
       matrix:
-        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [single-gpu]
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
-    container:
-      image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
-
-  run_tests_multi_gpu:
-    name: Model tests
-    strategy:
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [multi-gpu]
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
-    container:
-      image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
-
-      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+        machine_type: [single-gpu, multi-gpu]
+        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
+    uses: ./.github/workflows/model_jobs.yml
+    with:
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      machine_type: ${{ matrix.machine_type }}
+      slice_id: ${{ matrix.slice_id }}
+    secrets: inherit
 
   run_examples_gpu:
     name: Examples directory
@@ -407,8 +303,7 @@ jobs:
     if: always()
     needs: [
       setup,
-      run_tests_single_gpu,
-      run_tests_multi_gpu,
+      run_tests_gpu,
       run_examples_gpu,
       run_pipelines_tf_gpu,
       run_pipelines_torch_gpu,
@@ -455,8 +350,7 @@ jobs:
     if: always()
     needs: [
       setup,
-      run_tests_single_gpu,
-      run_tests_multi_gpu,
+      run_tests_gpu,
       run_examples_gpu,
       run_pipelines_tf_gpu,
       run_pipelines_torch_gpu,
@@ -490,7 +384,7 @@ jobs:
           sudo apt-get install -y curl
           pip install slack_sdk
           pip show slack_sdk
-          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
+          python utils/notification_service.py "${{ needs.setup.outputs.folder_slices }}"
 
       # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
       - name: Failure table artifacts
diff --git a/utils/notification_service.py b/utils/notification_service.py
index 27adf054f2..39a0fb840c 100644
--- a/utils/notification_service.py
+++ b/utils/notification_service.py
@@ -931,9 +931,9 @@ if __name__ == "__main__":
 
     arguments = sys.argv[1:][0]
     try:
-        models = ast.literal_eval(arguments)
+        folder_slices = ast.literal_eval(arguments)
         # Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names).
-        models = [x.replace("models/", "models_") for x in models]
+        models = [x.replace("models/", "models_") for folders in folder_slices for x in folders]
     except SyntaxError:
         Message.error_out(title, ci_title)
         raise ValueError("Errored out.")
diff --git a/utils/split_model_tests.py b/utils/split_model_tests.py
new file mode 100644
index 0000000000..fc8800ffcf
--- /dev/null
+++ b/utils/split_model_tests.py
@@ -0,0 +1,70 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script is used to get the list of folders under `tests/models` and split the list into `NUM_SLICES` splits.
+The main use case is a GitHub Actions workflow file calling this script to get the (nested) list of folders allowing it
+to split the list of jobs to run into multiple slices each containing a smaller number of jobs. This way, we can bypass
+the maximum of 256 jobs in a matrix.
+
+See the `setup` and `run_tests_gpu` jobs defined in the workflow file `.github/workflows/self-scheduled.yml` for more
+details.
+
+Usage:
+
+This script is required to be run under `tests` folder of `transformers` root directory.
+
+Assume we are under `transformers` root directory:
+```bash
+cd tests
+python ../utils/split_model_tests.py --num_splits 64
+```
+"""
+
+import argparse
+import os
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--num_splits",
+        type=int,
+        default=1,
+        help="the number of splits into which the (flat) list of folders will be split.",
+    )
+    args = parser.parse_args()
+    # `--num_splits 0` would raise `ZeroDivisionError` below, and a negative value would silently print `[]`.
+    if args.num_splits < 1:
+        parser.error("--num_splits must be a positive integer.")
+
+    # The relative paths checked with `os.path.isdir` below are resolved against the current working directory.
+    tests = os.getcwd()
+    model_tests = os.listdir(os.path.join(tests, "models"))
+    d1 = sorted(filter(os.path.isdir, os.listdir(tests)))
+    d2 = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))
+    d1.remove("models")
+    d = d2 + d1
+
+    num_jobs = len(d)
+    # Every split gets `num_jobs_per_splits` folders; the first `num_extra_jobs` splits get one more.
+    num_jobs_per_splits, num_extra_jobs = divmod(num_jobs, args.num_splits)
+
+    model_splits = []
+    end = 0
+    for idx in range(args.num_splits):
+        start = end
+        end = start + num_jobs_per_splits + (1 if idx < num_extra_jobs else 0)
+        model_splits.append(d[start:end])
+    print(model_splits)