Split daily CI using 2 level matrix (#28773)
* update / add new workflow files * Add comment * Use env.NUM_SLICES * use scripts * use scripts * use scripts * Fix * using one script * Fix * remove unused file * update * fail-fast: false * remove unused file * fix * fix * use matrix * inputs * style * update * fix * fix * no model name * add doc * allow args * style * pass argument --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
102
.github/workflows/model_jobs.yml
vendored
Normal file
102
.github/workflows/model_jobs.yml
vendored
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
name: model jobs
|
||||||
|
|
||||||
|
# Reusable workflow: it only runs when another workflow invokes it through `workflow_call`.
on:
  workflow_call:
    inputs:
      # JSON-encoded nested list of test folders; the job matrix decodes it with `fromJson`.
      folder_slices:
        type: string
        required: true
      # Used as the first `runs-on` label and as the prefix of report/artifact names.
      machine_type:
        type: string
        required: true
      # Index into the decoded `folder_slices`, selecting the slice whose folders become the matrix.
      slice_id:
        type: number
        required: true
|
||||||
|
|
||||||
|
# Environment shared by every step of this workflow. All values are quoted so that they stay plain strings no matter
# which YAML loader reads the file (`yes`/`true`/digits are otherwise subject to implicit typing).
env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: "yes"
  OMP_NUM_THREADS: "8"
  MKL_NUM_THREADS: "8"
  RUN_SLOW: "yes"
  # For gated repositories, we still need to agree to share information on the Hub repository page in order to get
  # access. This token is created under the bot `hf-transformers-bot`.
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: "true"
  RUN_PT_TF_CROSS_TESTS: "1"
  CUDA_VISIBLE_DEVICES: "0,1"
|
||||||
|
|
||||||
|
jobs:
  # Runs the test suite of every folder in one slice: the matrix has one entry per folder of
  # `fromJson(inputs.folder_slices)[inputs.slice_id]`.
  model_job:
    # NOTE(review): the name is a single space — presumably to keep the job names shown for the calling
    # workflow's matrix short; confirm before changing.
    name: " "
    strategy:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
    runs-on: ['${{ inputs.machine_type }}', nvidia-gpu, t4, daily-ci]
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Echo input and matrix info
        shell: bash
        # The values are handed to the script through environment variables instead of being interpolated into the
        # script text: the JSON produced by `toJson` contains double quotes, which would otherwise terminate the
        # surrounding shell string and garble the output.
        env:
          FOLDER_SLICES: ${{ inputs.folder_slices }}
          CURRENT_FOLDER: ${{ matrix.folders }}
          CURRENT_SLICE: ${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}
        run: |
          echo "$FOLDER_SLICES"
          echo "$CURRENT_FOLDER"
          echo "$CURRENT_SLICE"

      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
        # set the artifact folder names (because the character `/` is not allowed).
        run: |
          echo "${{ matrix.folders }}"
          matrix_folders="${{ matrix.folders }}"
          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> "$GITHUB_ENV"

      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      - name: Run all tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}

      # Only runs when a previous step failed; `continue-on-error` keeps a missing report file from adding a
      # second failure.
      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt

      # NOTE(review): this step only creates the report folder, writes a placeholder `hello.txt` into it and echoes
      # the folder name. It looks like a debugging leftover — confirm whether it is still needed.
      - name: Run test
        shell: bash
        run: |
          mkdir -p /transformers/reports/${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}
          echo "hello" > /transformers/reports/${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}/hello.txt
          echo "${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}"

      # The artifact name uses `env.matrix_folders` (with `/` replaced by `_`), while the uploaded path keeps the
      # original `matrix.folders` value.
      - name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v3
        with:
          name: ${{ inputs.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
          path: /transformers/reports/${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }}
|
||||||
142
.github/workflows/self-scheduled.yml
vendored
142
.github/workflows/self-scheduled.yml
vendored
@@ -27,6 +27,7 @@ env:
|
|||||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||||
RUN_PT_TF_CROSS_TESTS: 1
|
RUN_PT_TF_CROSS_TESTS: 1
|
||||||
CUDA_VISIBLE_DEVICES: 0,1
|
CUDA_VISIBLE_DEVICES: 0,1
|
||||||
|
NUM_SLICES: 2
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
setup:
|
setup:
|
||||||
@@ -39,7 +40,8 @@ jobs:
|
|||||||
image: huggingface/transformers-all-latest-gpu
|
image: huggingface/transformers-all-latest-gpu
|
||||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
outputs:
|
outputs:
|
||||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
|
||||||
|
slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
|
||||||
steps:
|
steps:
|
||||||
- name: Update clone
|
- name: Update clone
|
||||||
working-directory: /transformers
|
working-directory: /transformers
|
||||||
@@ -61,133 +63,27 @@ jobs:
|
|||||||
name: Identify models to test
|
name: Identify models to test
|
||||||
working-directory: /transformers/tests
|
working-directory: /transformers/tests
|
||||||
run: |
|
run: |
|
||||||
echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
|
echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
||||||
|
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
- name: NVIDIA-SMI
|
- name: NVIDIA-SMI
|
||||||
run: |
|
run: |
|
||||||
nvidia-smi
|
nvidia-smi
|
||||||
|
|
||||||
run_tests_single_gpu:
|
run_tests_gpu:
|
||||||
name: Model tests
|
name: " "
|
||||||
|
needs: setup
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
machine_type: [single-gpu, multi-gpu]
|
||||||
machine_type: [single-gpu]
|
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
||||||
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
|
uses: ./.github/workflows/model_jobs.yml
|
||||||
container:
|
|
||||||
image: huggingface/transformers-all-latest-gpu
|
|
||||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
|
||||||
needs: setup
|
|
||||||
steps:
|
|
||||||
- name: Echo folder ${{ matrix.folders }}
|
|
||||||
shell: bash
|
|
||||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
|
||||||
# set the artifact folder names (because the character `/` is not allowed).
|
|
||||||
run: |
|
|
||||||
echo "${{ matrix.folders }}"
|
|
||||||
matrix_folders=${{ matrix.folders }}
|
|
||||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
|
||||||
echo "$matrix_folders"
|
|
||||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
|
||||||
|
|
||||||
- name: Update clone
|
|
||||||
working-directory: /transformers
|
|
||||||
run: git fetch && git checkout ${{ github.sha }}
|
|
||||||
|
|
||||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
|
||||||
working-directory: /transformers
|
|
||||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
|
||||||
|
|
||||||
- name: NVIDIA-SMI
|
|
||||||
run: |
|
|
||||||
nvidia-smi
|
|
||||||
|
|
||||||
- name: Environment
|
|
||||||
working-directory: /transformers
|
|
||||||
run: |
|
|
||||||
python3 utils/print_env.py
|
|
||||||
|
|
||||||
- name: Show installed libraries and their versions
|
|
||||||
working-directory: /transformers
|
|
||||||
run: pip freeze
|
|
||||||
|
|
||||||
- name: Run all tests on GPU
|
|
||||||
working-directory: /transformers
|
|
||||||
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
|
|
||||||
|
|
||||||
- name: Failure short reports
|
|
||||||
if: ${{ failure() }}
|
|
||||||
continue-on-error: true
|
|
||||||
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
|
|
||||||
|
|
||||||
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
|
|
||||||
if: ${{ always() }}
|
|
||||||
uses: actions/upload-artifact@v3
|
|
||||||
with:
|
with:
|
||||||
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
|
folder_slices: ${{ needs.setup.outputs.folder_slices }}
|
||||||
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
|
machine_type: ${{ matrix.machine_type }}
|
||||||
|
slice_id: ${{ matrix.slice_id }}
|
||||||
run_tests_multi_gpu:
|
secrets: inherit
|
||||||
name: Model tests
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
|
|
||||||
machine_type: [multi-gpu]
|
|
||||||
runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
|
|
||||||
container:
|
|
||||||
image: huggingface/transformers-all-latest-gpu
|
|
||||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
|
||||||
needs: setup
|
|
||||||
steps:
|
|
||||||
- name: Echo folder ${{ matrix.folders }}
|
|
||||||
shell: bash
|
|
||||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
|
||||||
# set the artifact folder names (because the character `/` is not allowed).
|
|
||||||
run: |
|
|
||||||
echo "${{ matrix.folders }}"
|
|
||||||
matrix_folders=${{ matrix.folders }}
|
|
||||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
|
||||||
echo "$matrix_folders"
|
|
||||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
|
||||||
|
|
||||||
- name: Update clone
|
|
||||||
working-directory: /transformers
|
|
||||||
run: git fetch && git checkout ${{ github.sha }}
|
|
||||||
|
|
||||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
|
||||||
working-directory: /transformers
|
|
||||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
|
||||||
|
|
||||||
- name: NVIDIA-SMI
|
|
||||||
run: |
|
|
||||||
nvidia-smi
|
|
||||||
|
|
||||||
- name: Environment
|
|
||||||
working-directory: /transformers
|
|
||||||
run: |
|
|
||||||
python3 utils/print_env.py
|
|
||||||
|
|
||||||
- name: Show installed libraries and their versions
|
|
||||||
working-directory: /transformers
|
|
||||||
run: pip freeze
|
|
||||||
|
|
||||||
- name: Run all tests on GPU
|
|
||||||
working-directory: /transformers
|
|
||||||
run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
|
|
||||||
|
|
||||||
- name: Failure short reports
|
|
||||||
if: ${{ failure() }}
|
|
||||||
continue-on-error: true
|
|
||||||
run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
|
|
||||||
|
|
||||||
- name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
|
|
||||||
if: ${{ always() }}
|
|
||||||
uses: actions/upload-artifact@v3
|
|
||||||
with:
|
|
||||||
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
|
|
||||||
path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
|
|
||||||
|
|
||||||
run_examples_gpu:
|
run_examples_gpu:
|
||||||
name: Examples directory
|
name: Examples directory
|
||||||
@@ -407,8 +303,7 @@ jobs:
|
|||||||
if: always()
|
if: always()
|
||||||
needs: [
|
needs: [
|
||||||
setup,
|
setup,
|
||||||
run_tests_single_gpu,
|
run_tests_gpu,
|
||||||
run_tests_multi_gpu,
|
|
||||||
run_examples_gpu,
|
run_examples_gpu,
|
||||||
run_pipelines_tf_gpu,
|
run_pipelines_tf_gpu,
|
||||||
run_pipelines_torch_gpu,
|
run_pipelines_torch_gpu,
|
||||||
@@ -455,8 +350,7 @@ jobs:
|
|||||||
if: always()
|
if: always()
|
||||||
needs: [
|
needs: [
|
||||||
setup,
|
setup,
|
||||||
run_tests_single_gpu,
|
run_tests_gpu,
|
||||||
run_tests_multi_gpu,
|
|
||||||
run_examples_gpu,
|
run_examples_gpu,
|
||||||
run_pipelines_tf_gpu,
|
run_pipelines_tf_gpu,
|
||||||
run_pipelines_torch_gpu,
|
run_pipelines_torch_gpu,
|
||||||
@@ -490,7 +384,7 @@ jobs:
|
|||||||
sudo apt-get install -y curl
|
sudo apt-get install -y curl
|
||||||
pip install slack_sdk
|
pip install slack_sdk
|
||||||
pip show slack_sdk
|
pip show slack_sdk
|
||||||
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
|
python utils/notification_service.py "${{ needs.setup.outputs.folder_slices }}"
|
||||||
|
|
||||||
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
|
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
|
||||||
- name: Failure table artifacts
|
- name: Failure table artifacts
|
||||||
|
|||||||
@@ -931,9 +931,9 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
arguments = sys.argv[1:][0]
|
arguments = sys.argv[1:][0]
|
||||||
try:
|
try:
|
||||||
models = ast.literal_eval(arguments)
|
folder_slices = ast.literal_eval(arguments)
|
||||||
# Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names).
|
# Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names).
|
||||||
models = [x.replace("models/", "models_") for x in models]
|
models = [x.replace("models/", "models_") for folders in folder_slices for x in folders]
|
||||||
except SyntaxError:
|
except SyntaxError:
|
||||||
Message.error_out(title, ci_title)
|
Message.error_out(title, ci_title)
|
||||||
raise ValueError("Errored out.")
|
raise ValueError("Errored out.")
|
||||||
|
|||||||
65
utils/split_model_tests.py
Normal file
65
utils/split_model_tests.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
This script is used to get the list of folders under `tests/models` and split the list into `NUM_SLICES` splits.
|
||||||
|
The main use case is a GitHub Actions workflow file calling this script to get the (nested) list of folders allowing it
|
||||||
|
to split the list of jobs to run into multiple slices each containing a smaller number of jobs. This way, we can bypass
|
||||||
|
the maximum of 256 jobs in a matrix.
|
||||||
|
|
||||||
|
See the `setup` and `run_tests_gpu` jobs defined in the workflow file `.github/workflows/self-scheduled.yml` for more
|
||||||
|
details.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
|
||||||
|
This script must be run from the `tests` folder of the `transformers` root directory.
|
||||||
|
|
||||||
|
Assume we are under `transformers` root directory:
|
||||||
|
```bash
|
||||||
|
cd tests
|
||||||
|
python ../utils/split_model_tests.py --num_splits 64
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
def split_into_slices(folders, num_splits):
    """
    Split `folders` into `num_splits` contiguous slices whose sizes differ by at most one.

    Args:
        folders (`List[str]`):
            The flat, ordered list of folders to split.
        num_splits (`int`):
            The number of slices to produce. Must be at least 1.

    Returns:
        `List[List[str]]`: `num_splits` lists which, concatenated in order, give back `folders`. The first
        `len(folders) % num_splits` slices contain one more element than the remaining ones.

    Raises:
        `ValueError`: If `num_splits` is smaller than 1.
    """
    if num_splits < 1:
        raise ValueError(f"`num_splits` must be a positive integer, got {num_splits}.")

    num_jobs_per_split, num_larger_splits = divmod(len(folders), num_splits)

    slices = []
    end = 0
    for idx in range(num_splits):
        start = end
        # The leftover folders are handed out one by one to the first slices.
        end = start + num_jobs_per_split + (1 if idx < num_larger_splits else 0)
        slices.append(folders[start:end])

    return slices


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--num_splits",
        type=int,
        default=1,
        help="the number of splits into which the (flat) list of folders will be split.",
    )
    args = parser.parse_args()
    # Fail with a readable message instead of a `ZeroDivisionError` (0) or a silently empty result (negative).
    if args.num_splits < 1:
        parser.error(f"--num_splits must be a positive integer, got {args.num_splits}.")

    # The current working directory is treated as the `tests` folder: the relative paths below (`models/...`) are
    # resolved against it by `os.path.isdir`.
    tests = os.getcwd()
    model_tests = os.listdir(os.path.join(tests, "models"))
    d1 = sorted(filter(os.path.isdir, os.listdir(tests)))
    d2 = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))
    # `models` itself is replaced by its sub-folders (`d2`), which are listed first.
    d1.remove("models")
    d = d2 + d1

    # Printed as a Python list literal: the output is captured by the calling workflow.
    print(split_into_slices(d, args.num_splits))
|
||||||
Reference in New Issue
Block a user