From 7a8118947f3c6a802a9f63dc22c394961d38860f Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 7 Sep 2022 12:51:37 +0200 Subject: [PATCH] Add checks for more workflow jobs (#18905) * add check for scheduled CI * Add check to other CIs Co-authored-by: ydshieh --- .github/workflows/self-nightly-scheduled.yml | 32 +++++++++++-- .github/workflows/self-past.yml | 30 +++++++++++-- .github/workflows/self-scheduled.yml | 47 +++++++++++++++++--- 3 files changed, 95 insertions(+), 14 deletions(-) diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml index f1822be911..2b3283abf3 100644 --- a/.github/workflows/self-nightly-scheduled.yml +++ b/.github/workflows/self-nightly-scheduled.yml @@ -23,8 +23,23 @@ env: RUN_PT_TF_CROSS_TESTS: 1 jobs: + run_check_runners: + name: Check Runners + strategy: + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + container: + image: huggingface/transformers-all-latest-torch-nightly-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: NVIDIA-SMI + run: | + nvidia-smi + setup: name: Setup + needs: run_check_runners strategy: matrix: machine_type: [single-gpu, multi-gpu] @@ -68,7 +83,7 @@ jobs: container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup + needs: [run_check_runners, setup] steps: - name: Echo folder ${{ matrix.folders }} shell: bash @@ -121,7 +136,7 @@ jobs: container: image: huggingface/transformers-all-latest-torch-nightly-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup + needs: [run_check_runners, setup] steps: - name: Echo folder ${{ matrix.folders }} shell: bash @@ -170,7 +185,7 @@ jobs: matrix: machine_type: [single-gpu, multi-gpu] runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} - needs: setup + needs: [run_check_runners, setup] container: image: huggingface/transformers-pytorch-deepspeed-nightly-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -221,8 +236,15 @@ jobs: name: Send results to webhook runs-on: ubuntu-latest if: always() - needs: [setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu] + needs: [run_check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu] steps: + - name: Preliminary job status + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + echo "Runner status: ${{ needs.run_check_runners.result }}" + echo "Setup status: ${{ needs.setup.result }}" + - uses: actions/checkout@v2 - uses: actions/download-artifact@v2 - name: Send message to Slack @@ -233,6 +255,8 @@ jobs: CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} CI_EVENT: nightly-build + SETUP_STATUS: ${{ needs.setup.result }} + RUNNER_STATUS: ${{ needs.run_check_runners.result }} # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. run: | diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml index b3871dc92f..8e9130023b 100644 --- a/.github/workflows/self-past.yml +++ b/.github/workflows/self-past.yml @@ -50,6 +50,21 @@ jobs: cd tests echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" + run_check_runners: + name: Check Runners + needs: setup + strategy: + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }} + container: + image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: NVIDIA-SMI + run: | + nvidia-smi + run_tests_single_gpu: name: Model tests strategy: @@ -61,7 +76,7 @@ jobs: container: image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup + needs: [setup, run_check_runners] steps: - name: Update clone working-directory: /transformers @@ -114,7 +129,7 @@ jobs: container: image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup + needs: [setup, run_check_runners] steps: - name: Update clone working-directory: /transformers @@ -160,8 +175,15 @@ jobs: name: Send results to webhook runs-on: ubuntu-latest if: always() - needs: [setup, run_tests_single_gpu, run_tests_multi_gpu] + needs: [setup, run_check_runners, run_tests_single_gpu, run_tests_multi_gpu] steps: + - name: Preliminary job status + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + echo "Runner status: ${{ needs.run_check_runners.result }}" + echo "Setup status: ${{ needs.setup.result }}" + - uses: actions/checkout@v2 - uses: actions/download-artifact@v2 @@ -177,6 +199,8 @@ jobs: CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }} + SETUP_STATUS: ${{ needs.setup.result }} + RUNNER_STATUS: ${{ needs.run_check_runners.result }} # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. run: | diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 82d33babf4..a0a10921ae 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -22,8 +22,23 @@ env: RUN_PT_TF_CROSS_TESTS: 1 jobs: + run_check_runners: + name: Check Runners + strategy: + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: NVIDIA-SMI + run: | + nvidia-smi + setup: name: Setup + needs: run_check_runners strategy: matrix: machine_type: [single-gpu, multi-gpu] @@ -67,7 +82,7 @@ jobs: container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup + needs: [run_check_runners, setup] steps: - name: Echo folder ${{ matrix.folders }} shell: bash @@ -120,7 +135,7 @@ jobs: container: image: huggingface/transformers-all-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup + needs: [run_check_runners, setup] steps: - name: Echo folder ${{ matrix.folders }} shell: bash @@ -168,7 +183,7 @@ jobs: container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup + needs: [run_check_runners, setup] steps: - name: Update clone working-directory: /transformers @@ -211,7 +226,7 @@ jobs: container: image: huggingface/transformers-pytorch-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup + needs: [run_check_runners, setup] steps: - name: Update clone working-directory: /transformers @@ -255,7 +270,7 @@ jobs: container: image: huggingface/transformers-tensorflow-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup + needs: [run_check_runners, setup] steps: - name: Update clone working-directory: /transformers @@ -297,7 +312,7 @@ jobs: matrix: machine_type: [single-gpu, multi-gpu] runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} - needs: setup + needs: [run_check_runners, setup] container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -346,8 +361,24 @@ jobs: name: Send results to webhook runs-on: ubuntu-latest if: always() - needs: [setup, run_tests_single_gpu, run_tests_multi_gpu, run_examples_gpu, run_pipelines_tf_gpu, run_pipelines_torch_gpu, run_all_tests_torch_cuda_extensions_gpu] + needs: [ + run_check_runners, + setup, + run_tests_single_gpu, + run_tests_multi_gpu, + run_examples_gpu, + run_pipelines_tf_gpu, + run_pipelines_torch_gpu, + run_all_tests_torch_cuda_extensions_gpu + ] steps: + - name: Preliminary job status + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + echo "Runner status: ${{ needs.run_check_runners.result }}" + echo "Setup status: ${{ needs.setup.result }}" + - uses: actions/checkout@v2 - uses: actions/download-artifact@v2 - name: Send message to Slack @@ -358,6 +389,8 @@ jobs: CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_EVENT: scheduled + SETUP_STATUS: ${{ needs.setup.result }} + RUNNER_STATUS: ${{ needs.run_check_runners.result }} # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. run: |