Add offline runners info in the Slack report (#19169)
* Send Slack report for offline runners. Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
12
.github/workflows/check_runner_status.yml
vendored
12
.github/workflows/check_runner_status.yml
vendored
@@ -19,6 +19,8 @@ jobs:
|
|||||||
check_runner_status:
|
check_runner_status:
|
||||||
name: Check Runner Status
|
name: Check Runner Status
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
outputs:
|
||||||
|
offline_runners: ${{ steps.set-offline_runners.outputs.offline_runners }}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout transformers
|
- name: Checkout transformers
|
||||||
uses: actions/checkout@v2
|
uses: actions/checkout@v2
|
||||||
@@ -26,7 +28,14 @@ jobs:
|
|||||||
fetch-depth: 2
|
fetch-depth: 2
|
||||||
|
|
||||||
- name: Check Runner Status
|
- name: Check Runner Status
|
||||||
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker,single-gpu-scheduled-ci-runner-docker,multi-scheduled-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker,single-gpu-scheduled-ci-runner-docker,multi-scheduled-scheduled-ci-runner-docker,single-gpu-doctest-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||||
|
|
||||||
|
- id: set-offline_runners
|
||||||
|
name: Set output for offline runners
|
||||||
|
if: ${{ always() }}
|
||||||
|
run: |
|
||||||
|
offline_runners=$(python3 -c 'fp = open("offline_runners.txt"); failed = fp.read(); fp.close(); print(failed)')
|
||||||
|
echo "::set-output name=offline_runners::$offline_runners"
|
||||||
|
|
||||||
send_results:
|
send_results:
|
||||||
name: Send results to webhook
|
name: Send results to webhook
|
||||||
@@ -50,6 +59,7 @@ jobs:
|
|||||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||||
CI_EVENT: runner status check
|
CI_EVENT: runner status check
|
||||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||||
|
OFFLINE_RUNNERS: ${{ needs.check_runner_status.outputs.offline_runners }}
|
||||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ import subprocess
|
|||||||
|
|
||||||
def get_runner_status(target_runners, token):
|
def get_runner_status(target_runners, token):
|
||||||
|
|
||||||
|
offline_runners = []
|
||||||
|
|
||||||
cmd = (
|
cmd = (
|
||||||
f'curl -H "Accept: application/vnd.github+json" -H "Authorization: Bearer {token}"'
|
f'curl -H "Accept: application/vnd.github+json" -H "Authorization: Bearer {token}"'
|
||||||
" https://api.github.com/repos/huggingface/transformers/actions/runners"
|
" https://api.github.com/repos/huggingface/transformers/actions/runners"
|
||||||
@@ -17,7 +19,15 @@ def get_runner_status(target_runners, token):
|
|||||||
for runner in runners:
|
for runner in runners:
|
||||||
if runner["name"] in target_runners:
|
if runner["name"] in target_runners:
|
||||||
if runner["status"] == "offline":
|
if runner["status"] == "offline":
|
||||||
raise ValueError(f"{runner['name']} is offline!")
|
offline_runners.append(runner)
|
||||||
|
|
||||||
|
# save the result so we can report them on Slack
|
||||||
|
with open("offline_runners.txt", "w") as fp:
|
||||||
|
fp.write(json.dumps(offline_runners))
|
||||||
|
|
||||||
|
if len(offline_runners) > 0:
|
||||||
|
failed = "\n".join(offline_runners)
|
||||||
|
raise ValueError(f"The following runners are offline:\n{failed}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -397,8 +397,12 @@ class Message:
|
|||||||
ci_title_block = {"type": "section", "text": {"type": "mrkdwn", "text": ci_title}}
|
ci_title_block = {"type": "section", "text": {"type": "mrkdwn", "text": ci_title}}
|
||||||
blocks.append(ci_title_block)
|
blocks.append(ci_title_block)
|
||||||
|
|
||||||
|
offline_runners = []
|
||||||
if runner_not_available:
|
if runner_not_available:
|
||||||
text = "💔 CI runners are not available! Tests are not run. 😭"
|
text = "💔 CI runners are not available! Tests are not run. 😭"
|
||||||
|
result = os.environ.get("OFFLINE_RUNNERS")
|
||||||
|
if result is not None:
|
||||||
|
offline_runners = json.loads(result)
|
||||||
elif runner_failed:
|
elif runner_failed:
|
||||||
text = "💔 CI runners have problems! Tests are not run. 😭"
|
text = "💔 CI runners have problems! Tests are not run. 😭"
|
||||||
elif setup_failed:
|
elif setup_failed:
|
||||||
@@ -413,11 +417,18 @@ class Message:
|
|||||||
"text": text,
|
"text": text,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
text = ""
|
||||||
|
if len(offline_runners) > 0:
|
||||||
|
text = "\n • " + "\n • ".join(offline_runners)
|
||||||
|
text = f"The following runners are offline:\n{text}\n\n"
|
||||||
|
text += "🙏 Let's fix it ASAP! 🙏"
|
||||||
|
|
||||||
error_block_2 = {
|
error_block_2 = {
|
||||||
"type": "section",
|
"type": "section",
|
||||||
"text": {
|
"text": {
|
||||||
"type": "plain_text",
|
"type": "plain_text",
|
||||||
"text": "🙏 Let's fix it ASAP! 🙏",
|
"text": text,
|
||||||
},
|
},
|
||||||
"accessory": {
|
"accessory": {
|
||||||
"type": "button",
|
"type": "button",
|
||||||
|
|||||||
Reference in New Issue
Block a user