Add runner availability check (#19054)

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-09-19 12:27:06 +02:00
parent ca485e562b
commit ba7f2173cc
7 changed files with 257 additions and 71 deletions
--- a/.github/workflows/check_runner_status.yml
+++ b/.github/workflows/check_runner_status.yml
@@ -0,0 +1,57 @@
+name: Self-hosted runner (check runner status)
+
+# Note that each job's dependencies go into a corresponding docker file.
+#
+# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
+# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
+# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
+
+on:
+  repository_dispatch:
+  schedule:
+    # Run once per hour, at minute 0 (a `*` in the minute field would trigger the workflow every minute).
+    - cron: "0 */1 * * *"
+
+env:
+  TRANSFORMERS_IS_CI: yes
+
+jobs:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-latest  # GitHub-hosted, so this check does not itself need a self-hosted runner
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker,single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  send_results:
+    name: Send results to webhook
+    runs-on: ubuntu-latest
+    needs: check_runner_status
+    if: ${{ failure() }}  # report only when the `check_runner_status` job it depends on has failed
+    steps:
+      - name: Preliminary job status
+        shell: bash
+        run: |
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
+
+      - uses: actions/checkout@v2
+      - uses: actions/download-artifact@v2
+      - name: Send message to Slack
+        env:
+          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
+          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+          CI_EVENT: runner status check
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+        # `notification_service.py` is invoked without arguments here: this workflow has no `setup` job, so there is
+        # no test matrix to pass. Presumably it reads `CI_EVENT` / `RUNNER_STATUS` from the environment - confirm.
+        run: |
+          pip install slack_sdk
+          python utils/notification_service.py
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@@ -23,8 +23,21 @@ env:
  RUN_PT_TF_CROSS_TESTS: 1

 jobs:
-  run_check_runners:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  check_runners:
    name: Check Runners
+    needs: check_runner_status
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
@@ -39,7 +52,7 @@ jobs:

  setup:
    name: Setup
-    needs: run_check_runners
+    needs: check_runners
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
@@ -83,7 +96,7 @@ jobs:
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
@@ -136,7 +149,7 @@ jobs:
    container:
      image: huggingface/transformers-all-latest-torch-nightly-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
@@ -185,7 +198,7 @@ jobs:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
-    needs: [run_check_runners, setup]
+    needs: setup
    container:
      image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -236,13 +249,21 @@ jobs:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
-    needs: [run_check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu]
+    needs: [
+      check_runner_status,
+      check_runners,
+      setup,
+      run_tests_single_gpu,
+      run_tests_multi_gpu,
+      run_all_tests_torch_cuda_extensions_gpu
+    ]
    steps:
      - name: Preliminary job status
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
        run: |
-          echo "Runner status: ${{ needs.run_check_runners.result }}"
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
+          echo "Runner status: ${{ needs.check_runners.result }}"
          echo "Setup status: ${{ needs.setup.result }}"

      - uses: actions/checkout@v2
@@ -255,8 +276,9 @@ jobs:
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
          CI_EVENT: nightly-build
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}
-          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
--- a/.github/workflows/self-past.yml
+++ b/.github/workflows/self-past.yml
@@ -27,9 +27,43 @@ env:
  RUN_PT_TF_CROSS_TESTS: 1

 jobs:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  check_runners:
+    name: Check Runners
+    needs: check_runner_status
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
+    container:
+      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
  setup:
    name: Setup
-    runs-on: ubuntu-latest
+    needs: check_runners
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
+    container:
+      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
@@ -50,21 +84,6 @@ jobs:
          cd tests
          echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"

-  run_check_runners:
-    name: Check Runners
-    needs: setup
-    strategy:
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
-    container:
-      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
  run_tests_single_gpu:
    name: Model tests
    strategy:
@@ -76,7 +95,7 @@ jobs:
    container:
      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [setup, run_check_runners]
+    needs: setup
    steps:
      - name: Update clone
        working-directory: /transformers
@@ -129,7 +148,7 @@ jobs:
    container:
      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [setup, run_check_runners]
+    needs: setup
    steps:
      - name: Update clone
        working-directory: /transformers
@@ -175,13 +194,14 @@ jobs:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
-    needs: [setup, run_check_runners, run_tests_single_gpu, run_tests_multi_gpu]
+    needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu]
    steps:
      - name: Preliminary job status
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
        run: |
-          echo "Runner status: ${{ needs.run_check_runners.result }}"
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
+          echo "Runner status: ${{ needs.check_runners.result }}"
          echo "Setup status: ${{ needs.setup.result }}"

      - uses: actions/checkout@v2
@@ -199,8 +219,9 @@ jobs:
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
          CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}
-          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -27,9 +27,43 @@ env:
  RUN_PT_TF_CROSS_TESTS: 1

 jobs:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  check_runners:
+    name: Check Runners
+    needs: check_runner_status
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
  setup:
    name: Setup
-    runs-on: ubuntu-latest
+    needs: check_runners
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      test_map: ${{ steps.set-matrix.outputs.test_map }}
@@ -111,24 +145,9 @@ jobs:
          echo "::set-output name=matrix::$keys"
          echo "::set-output name=test_map::$test_map"

-  run_check_runners:
-    name: Check Runners
-    needs: setup
-    strategy:
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
-    container:
-      image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
  run_tests_single_gpu:
    name: Model tests
-    needs: [setup, run_check_runners]
+    needs: setup
    # `dummy` means there is no test to run
    if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
    strategy:
@@ -213,7 +232,7 @@ jobs:

  run_tests_multi_gpu:
    name: Model tests
-    needs: [setup, run_check_runners]
+    needs: setup
    # `dummy` means there is no test to run
    if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
    strategy:
@@ -300,7 +319,7 @@ jobs:

  run_tests_torch_cuda_extensions_single_gpu:
    name: Torch CUDA extension tests
-    needs: [setup, run_check_runners]
+    needs: setup
    if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
    strategy:
      fail-fast: false
@@ -382,7 +401,7 @@ jobs:

  run_tests_torch_cuda_extensions_multi_gpu:
    name: Torch CUDA extension tests
-    needs: [setup, run_check_runners]
+    needs: setup
    if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
    strategy:
      fail-fast: false
@@ -467,8 +486,9 @@ jobs:
    runs-on: ubuntu-latest
    if: always()
    needs: [
+        check_runner_status,
+        check_runners,
        setup,
-        run_check_runners,
        run_tests_single_gpu,
        run_tests_multi_gpu,
        run_tests_torch_cuda_extensions_single_gpu,
@@ -479,8 +499,9 @@ jobs:
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
        run: |
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
          echo "Setup status: ${{ needs.setup.result }}"
-          echo "Runner status: ${{ needs.run_check_runners.result }}"
+          echo "Runner status: ${{ needs.check_runners.result }}"

      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
      # We also take into account the `push` event (we might want to test some changes in a branch)
@@ -527,8 +548,9 @@ jobs:
          CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
          CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
          CI_SHA: ${{ env.CI_SHA }}
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}
-          RUNNER_STATUS: ${{ needs.run_check_runners.result }}

        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -22,8 +22,21 @@ env:
  RUN_PT_TF_CROSS_TESTS: 1

 jobs:
-  run_check_runners:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  check_runners:
    name: Check Runners
+    needs: check_runner_status
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
@@ -38,7 +51,7 @@ jobs:

  setup:
    name: Setup
-    needs: run_check_runners
+    needs: check_runners
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
@@ -82,7 +95,7 @@ jobs:
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
@@ -135,7 +148,7 @@ jobs:
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
@@ -183,7 +196,7 @@ jobs:
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
    steps:
      - name: Update clone
        working-directory: /transformers
@@ -226,7 +239,7 @@ jobs:
    container:
      image: huggingface/transformers-pytorch-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
    steps:
      - name: Update clone
        working-directory: /transformers
@@ -270,7 +283,7 @@ jobs:
    container:
      image: huggingface/transformers-tensorflow-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
    steps:
      - name: Update clone
        working-directory: /transformers
@@ -312,7 +325,7 @@ jobs:
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
-    needs: [run_check_runners, setup]
+    needs: setup
    container:
      image: huggingface/transformers-pytorch-deepspeed-latest-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -362,7 +375,8 @@ jobs:
    runs-on: ubuntu-latest
    if: always()
    needs: [
-      run_check_runners,
+      check_runner_status,
+      check_runners,
      setup,
      run_tests_single_gpu,
      run_tests_multi_gpu,
@@ -376,7 +390,8 @@ jobs:
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
        run: |
-          echo "Runner status: ${{ needs.run_check_runners.result }}"
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
+          echo "Runner status: ${{ needs.check_runners.result }}"
          echo "Setup status: ${{ needs.setup.result }}"

      - uses: actions/checkout@v2
@@ -389,8 +404,9 @@ jobs:
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          CI_EVENT: scheduled
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}
-          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        run: |