Report offline self-hosted runners on Slack.

The runner check now collects the *names* of all offline target runners,
writes them as a JSON list of strings to offline_runners.txt, and fails with a
single error listing them. The workflow exposes that file's content as the job
output `offline_runners`, which notification_service.py reads from the
OFFLINE_RUNNERS environment variable.

NOTE(review): this text precedes the first "diff --git" header and is skipped
by `git apply` / `patch`. Indentation was reconstructed from a
whitespace-collapsed copy of the patch - verify the context lines against the
tree before applying. Post-image blob ids in the "index" lines are kept as
originally recorded.

diff --git a/.github/workflows/check_runner_status.yml b/.github/workflows/check_runner_status.yml
index 2b61bfd971..9bc1616537 100644
--- a/.github/workflows/check_runner_status.yml
+++ b/.github/workflows/check_runner_status.yml
@@ -19,6 +19,8 @@ jobs:
   check_runner_status:
     name: Check Runner Status
     runs-on: ubuntu-latest
+    outputs:
+      offline_runners: ${{ steps.set-offline_runners.outputs.offline_runners }}
     steps:
       - name: Checkout transformers
         uses: actions/checkout@v2
@@ -26,7 +28,14 @@ jobs:
           fetch-depth: 2
 
       - name: Check Runner Status
-        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker,single-gpu-scheduled-ci-runner-docker,multi-scheduled-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker,single-gpu-scheduled-ci-runner-docker,multi-scheduled-scheduled-ci-runner-docker,single-gpu-doctest-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+      - id: set-offline_runners
+        name: Set output for offline runners
+        if: ${{ always() }}
+        run: |
+          offline_runners=$(python3 -c 'fp = open("offline_runners.txt"); failed = fp.read(); fp.close(); print(failed)')
+          echo "::set-output name=offline_runners::$offline_runners"
 
   send_results:
     name: Send results to webhook
@@ -50,6 +59,7 @@ jobs:
           CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
           CI_EVENT: runner status check
           RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+          OFFLINE_RUNNERS: ${{ needs.check_runner_status.outputs.offline_runners }}
         # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
         run: |
diff --git a/utils/check_self_hosted_runner.py b/utils/check_self_hosted_runner.py
index 47049d9925..f7303366ea 100644
--- a/utils/check_self_hosted_runner.py
+++ b/utils/check_self_hosted_runner.py
@@ -5,6 +5,8 @@ import subprocess
 
 
 def get_runner_status(target_runners, token):
+    offline_runners = []
+
     cmd = (
         f'curl -H "Accept: application/vnd.github+json" -H "Authorization: Bearer {token}"'
         " https://api.github.com/repos/huggingface/transformers/actions/runners"
@@ -17,7 +19,15 @@ def get_runner_status(target_runners, token):
     for runner in runners:
         if runner["name"] in target_runners:
             if runner["status"] == "offline":
-                raise ValueError(f"{runner['name']} is offline!")
+                offline_runners.append(runner["name"])
+
+    # save the offline runner names (a JSON list of strings) so we can report them on Slack
+    with open("offline_runners.txt", "w") as fp:
+        fp.write(json.dumps(offline_runners))
+
+    if len(offline_runners) > 0:
+        failed = "\n".join(offline_runners)
+        raise ValueError(f"The following runners are offline:\n{failed}")
 
 
 if __name__ == "__main__":
diff --git a/utils/notification_service.py b/utils/notification_service.py
index 5b5fdd46f1..d4b5479aec 100644
--- a/utils/notification_service.py
+++ b/utils/notification_service.py
@@ -397,8 +397,12 @@ class Message:
         ci_title_block = {"type": "section", "text": {"type": "mrkdwn", "text": ci_title}}
         blocks.append(ci_title_block)
 
+        offline_runners = []
         if runner_not_available:
             text = "💔 CI runners are not available! Tests are not run. 😭"
+            result = os.environ.get("OFFLINE_RUNNERS")
+            if result:  # an unset job output arrives as "", which `json.loads` rejects
+                offline_runners = json.loads(result)
         elif runner_failed:
             text = "💔 CI runners have problems! Tests are not run. 😭"
         elif setup_failed:
@@ -413,11 +417,18 @@ class Message:
                 "text": text,
             },
         }
+
+        text = ""
+        if len(offline_runners) > 0:
+            text = "\n • " + "\n • ".join(offline_runners)
+            text = f"The following runners are offline:\n{text}\n\n"
+        text += "🙏 Let's fix it ASAP! 🙏"
+
         error_block_2 = {
             "type": "section",
             "text": {
                 "type": "plain_text",
-                "text": "🙏 Let's fix it ASAP! 🙏",
+                "text": text,
             },
             "accessory": {
                 "type": "button",