Files
HuggingFace_transformer/.github/workflows/self-scheduled.yml
Yih-Dar 38043d8453 Update self-push workflow (#17177)
* update push ci

* install git-python

* update comment

* update deepspeed jobs

* fix report

* skip 2 more tests that require fairscale

* Fix changes in test_fetcher.py (to deal with `setup.py` is changed)

* set RUN_PT_TF_CROSS_TESTS=1 and final clean-up

* remove SIGOPT_API_TOKEN

* remove echo "$matrix_folders"

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-05-13 16:28:00 +02:00

362 lines
14 KiB
YAML

name: Self-hosted runner (scheduled)
# Note that each job's dependencies go into a corresponding docker file.
#
# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
on:
repository_dispatch:
schedule:
- cron: "0 2 * * *"
env:
HF_HOME: /mnt/cache
TRANSFORMERS_IS_CI: yes
OMP_NUM_THREADS: 8
MKL_NUM_THREADS: 8
RUN_SLOW: yes
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
TF_FORCE_GPU_ALLOW_GROWTH: true
RUN_PT_TF_CROSS_TESTS: 1
jobs:
setup:
name: Setup
strategy:
matrix:
machines: [multi-gpu-docker, single-gpu-docker]
runs-on: ${{ matrix.machines }}
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- name: Update clone
working-directory: /transformers
run: |
git fetch && git checkout ${{ github.sha }}
- name: Cleanup
working-directory: /transformers
run: |
rm -rf tests/__pycache__
rm -rf tests/models/__pycache__
rm -rf reports
- id: set-matrix
name: Identify models to test
working-directory: /transformers/tests
run: |
echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: GPU visibility
working-directory: /transformers
run: |
utils/print_env_pt.py
TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
run_tests_single_gpu:
name: Model tests
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machines: [single-gpu-docker]
runs-on: ${{ matrix.machines }}
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: setup
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
# Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`.
- name: Set machine type from ${{ matrix.machines }}
shell: bash
run: |
machine_type=${{ matrix.machines }}
machine_type=${machine_type/'-docker'/''}
echo "machine_type=$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}
run_tests_multi_gpu:
name: Model tests
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.setup.outputs.matrix) }}
machines: [multi-gpu-docker]
runs-on: ${{ matrix.machines }}
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: setup
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
# Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`.
- name: Set machine type from ${{ matrix.machines }}
shell: bash
run: |
machine_type=${{ matrix.machines }}
machine_type=${machine_type/'-docker'/''}
echo "machine_type=$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
- name: Run all tests on GPU
working-directory: /transformers
run: python3 -m pytest -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}
run_examples_gpu:
name: Examples directory
runs-on: [self-hosted, single-gpu-docker]
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: setup
steps:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
- name: Run examples tests on GPU
working-directory: /transformers
run: |
pip install -r examples/pytorch/_tests_requirements.txt
python3 -m pytest -v --make-reports=examples_gpu examples/pytorch
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/examples_gpu/failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_examples_gpu
path: /transformers/reports/examples_gpu
run_pipelines_torch_gpu:
name: PyTorch pipelines
strategy:
fail-fast: false
matrix:
machines: [multi-gpu-docker, single-gpu-docker]
runs-on: ${{ matrix.machines }}
container:
image: huggingface/transformers-pytorch-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: setup
steps:
# Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`.
- name: Set machine type from ${{ matrix.machines }}
shell: bash
run: |
machine_type=${{ matrix.machines }}
machine_type=${machine_type/'-docker'/''}
echo "machine_type=$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
- name: Run all pipeline tests on GPU
working-directory: /transformers
env:
RUN_PIPELINE_TESTS: yes
run: |
python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ env.machine_type }}_tests_torch_pipeline_gpu tests
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: ${{ env.machine_type }}_run_tests_torch_pipeline_gpu
path: /transformers/reports/${{ env.machine_type }}_tests_torch_pipeline_gpu
run_pipelines_tf_gpu:
name: TensorFlow pipelines
strategy:
fail-fast: false
matrix:
machines: [multi-gpu-docker, single-gpu-docker]
runs-on: ${{ matrix.machines }}
container:
image: huggingface/transformers-tensorflow-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: setup
steps:
# Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`.
- name: Set machine type from ${{ matrix.machines }}
shell: bash
run: |
machine_type=${{ matrix.machines }}
machine_type=${machine_type/'-docker'/''}
echo "machine_type=$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Update clone
working-directory: /transformers
run: |
git fetch && git checkout ${{ github.sha }}
- name: Run all pipeline tests on GPU
working-directory: /transformers
env:
RUN_PIPELINE_TESTS: yes
run: |
python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ env.machine_type }}_tests_tf_pipeline_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: |
cat /transformers/reports/${{ env.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: ${{ env.machine_type }}_run_tests_tf_pipeline_gpu
path: /transformers/reports/${{ env.machine_type }}_tests_tf_pipeline_gpu
run_all_tests_torch_cuda_extensions_gpu:
name: Torch CUDA extension tests
strategy:
fail-fast: false
matrix:
machines: [multi-gpu-docker, single-gpu-docker]
runs-on: ${{ matrix.machines }}
needs: setup
container:
image: huggingface/transformers-pytorch-deepspeed-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
# Set machine type, i.e. `single-gpu` or `multi-gpu`. Here we just remove `-docker`.
- name: Set machine type from ${{ matrix.machines }}
shell: bash
run: |
machine_type=${{ matrix.machines }}
machine_type=${machine_type/'-docker'/''}
echo "machine_type=$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Update clone
working-directory: /workspace/transformers
run: git fetch && git checkout ${{ github.sha }}
- name: Re-compile DeepSpeed
working-directory: /workspace
run: |
pip install deepspeed # installs the deps correctly
rm -rf DeepSpeed
git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: Run all tests on GPU
working-directory: /workspace/transformers
run: |
python -m pytest -v --make-reports=${{ env.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /workspace/transformers/reports/${{ env.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: ${{ env.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
path: /workspace/transformers/reports/${{ env.machine_type }}_tests_torch_cuda_extensions_gpu
send_results:
name: Send results to webhook
runs-on: ubuntu-latest
if: always()
needs: [setup, run_tests_single_gpu, run_tests_multi_gpu, run_examples_gpu, run_pipelines_tf_gpu, run_pipelines_torch_gpu, run_all_tests_torch_cuda_extensions_gpu]
steps:
- uses: actions/checkout@v2
- uses: actions/download-artifact@v2
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_EVENT: scheduled
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
pip install slack_sdk
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"