Add checks for more workflow jobs (#18905)

* add check for scheduled CI

* Add check to other CIs

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-09-07 12:51:37 +02:00
parent c25f27fa6a
commit 7a8118947f
3 changed files with 95 additions and 14 deletions
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@@ -23,8 +23,23 @@ env:
  RUN_PT_TF_CROSS_TESTS: 1

 jobs:
+  run_check_runners:
+    name: Check Runners
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
+    container:
+      image: huggingface/transformers-all-latest-torch-nightly-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
  setup:
    name: Setup
+    needs: run_check_runners
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
@@ -68,7 +83,7 @@ jobs:
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
+    needs: [run_check_runners, setup]
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
@@ -121,7 +136,7 @@ jobs:
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
+    needs: [run_check_runners, setup]
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
@@ -170,7 +185,7 @@ jobs:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
-    needs: setup
+    needs: [run_check_runners, setup]
    container:
      image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -221,8 +236,15 @@ jobs:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
-    needs: [setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu]
+    needs: [run_check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu]
    steps:
+      - name: Preliminary job status
+        shell: bash
+        # Log the results of the prerequisite jobs `run_check_runners` and `setup` before reporting.
+        run: |
+          echo "Runner status: ${{ needs.run_check_runners.result }}"
+          echo "Setup status: ${{ needs.setup.result }}"
+
      - uses: actions/checkout@v2
      - uses: actions/download-artifact@v2
      - name: Send message to Slack
@@ -233,6 +255,8 @@ jobs:
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
          CI_EVENT: nightly-build
+          SETUP_STATUS: ${{ needs.setup.result }}
+          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
--- a/.github/workflows/self-past.yml
+++ b/.github/workflows/self-past.yml
@@ -50,6 +50,21 @@ jobs:
          cd tests
          echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"

+  run_check_runners:
+    name: Check Runners
+    needs: setup
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
+    container:
+      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
  run_tests_single_gpu:
    name: Model tests
    strategy:
@@ -61,7 +76,7 @@ jobs:
    container:
      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
+    needs: [setup, run_check_runners]
    steps:
      - name: Update clone
        working-directory: /transformers
@@ -114,7 +129,7 @@ jobs:
    container:
      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
+    needs: [setup, run_check_runners]
    steps:
      - name: Update clone
        working-directory: /transformers
@@ -160,8 +175,15 @@ jobs:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
-    needs: [setup, run_tests_single_gpu, run_tests_multi_gpu]
+    needs: [setup, run_check_runners, run_tests_single_gpu, run_tests_multi_gpu]
    steps:
+      - name: Preliminary job status
+        shell: bash
+        # Log the results of the prerequisite jobs `run_check_runners` and `setup` before reporting.
+        run: |
+          echo "Runner status: ${{ needs.run_check_runners.result }}"
+          echo "Setup status: ${{ needs.setup.result }}"
+
      - uses: actions/checkout@v2
      - uses: actions/download-artifact@v2

@@ -177,6 +199,8 @@ jobs:
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
          CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
+          SETUP_STATUS: ${{ needs.setup.result }}
+          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -22,8 +22,23 @@ env:
  RUN_PT_TF_CROSS_TESTS: 1

 jobs:
+  run_check_runners:
+    name: Check Runners
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
  setup:
    name: Setup
+    needs: run_check_runners
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
@@ -67,7 +82,7 @@ jobs:
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
+    needs: [run_check_runners, setup]
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
@@ -120,7 +135,7 @@ jobs:
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
+    needs: [run_check_runners, setup]
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
@@ -168,7 +183,7 @@ jobs:
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
+    needs: [run_check_runners, setup]
    steps:
      - name: Update clone
        working-directory: /transformers
@@ -211,7 +226,7 @@ jobs:
    container:
      image: huggingface/transformers-pytorch-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
+    needs: [run_check_runners, setup]
    steps:
      - name: Update clone
        working-directory: /transformers
@@ -255,7 +270,7 @@ jobs:
    container:
      image: huggingface/transformers-tensorflow-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
+    needs: [run_check_runners, setup]
    steps:
      - name: Update clone
        working-directory: /transformers
@@ -297,7 +312,7 @@ jobs:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
-    needs: setup
+    needs: [run_check_runners, setup]
    container:
      image: huggingface/transformers-pytorch-deepspeed-latest-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -346,8 +361,24 @@ jobs:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
-    needs: [setup, run_tests_single_gpu, run_tests_multi_gpu, run_examples_gpu, run_pipelines_tf_gpu, run_pipelines_torch_gpu, run_all_tests_torch_cuda_extensions_gpu]
+    needs: [
+      run_check_runners,
+      setup,
+      run_tests_single_gpu,
+      run_tests_multi_gpu,
+      run_examples_gpu,
+      run_pipelines_tf_gpu,
+      run_pipelines_torch_gpu,
+      run_all_tests_torch_cuda_extensions_gpu
+    ]
    steps:
+      - name: Preliminary job status
+        shell: bash
+        # Log the results of the prerequisite jobs `run_check_runners` and `setup` before reporting.
+        run: |
+          echo "Runner status: ${{ needs.run_check_runners.result }}"
+          echo "Setup status: ${{ needs.setup.result }}"
+
      - uses: actions/checkout@v2
      - uses: actions/download-artifact@v2
      - name: Send message to Slack
@@ -358,6 +389,8 @@ jobs:
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          CI_EVENT: scheduled
+          SETUP_STATUS: ${{ needs.setup.result }}
+          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |