Add runner availability check (#19054)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
57
.github/workflows/check_runner_status.yml
vendored
Normal file
57
.github/workflows/check_runner_status.yml
vendored
Normal file
@@ -0,0 +1,57 @@
|
||||
name: Self-hosted runner (check runner status)
|
||||
|
||||
# Note that each job's dependencies go into a corresponding docker file.
|
||||
#
|
||||
# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
|
||||
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
|
||||
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
|
||||
|
||||
on:
|
||||
repository_dispatch:
|
||||
schedule:
|
||||
# run per hour
|
||||
- cron: "* */1 * * *"
|
||||
|
||||
env:
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
|
||||
jobs:
|
||||
check_runner_status:
|
||||
name: Check Runner Status
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Check Runner Status
|
||||
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker,single-gpu-scheduled-ci-runner-docker,multi-scheduled-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
|
||||
send_results:
|
||||
name: Send results to webhook
|
||||
runs-on: ubuntu-latest
|
||||
needs: check_runner_status
|
||||
if: ${{ failure() }}
|
||||
steps:
|
||||
- name: Preliminary job status
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Runner availability: ${{ needs.check_runner_status.result }}"
|
||||
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/download-artifact@v2
|
||||
- name: Send message to Slack
|
||||
env:
|
||||
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
|
||||
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
|
||||
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_EVENT: runner status check
|
||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
pip install slack_sdk
|
||||
python utils/notification_service.py
|
||||
38
.github/workflows/self-nightly-scheduled.yml
vendored
38
.github/workflows/self-nightly-scheduled.yml
vendored
@@ -23,8 +23,21 @@ env:
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
|
||||
jobs:
|
||||
run_check_runners:
|
||||
check_runner_status:
|
||||
name: Check Runner Status
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Check Runner Status
|
||||
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
|
||||
check_runners:
|
||||
name: Check Runners
|
||||
needs: check_runner_status
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
@@ -39,7 +52,7 @@ jobs:
|
||||
|
||||
setup:
|
||||
name: Setup
|
||||
needs: run_check_runners
|
||||
needs: check_runners
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
@@ -83,7 +96,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-torch-nightly-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
@@ -136,7 +149,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-torch-nightly-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
@@ -185,7 +198,7 @@ jobs:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
@@ -236,13 +249,21 @@ jobs:
|
||||
name: Send results to webhook
|
||||
runs-on: ubuntu-latest
|
||||
if: always()
|
||||
needs: [run_check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu]
|
||||
needs: [
|
||||
check_runner_status,
|
||||
check_runners,
|
||||
setup,
|
||||
run_tests_single_gpu,
|
||||
run_tests_multi_gpu,
|
||||
run_all_tests_torch_cuda_extensions_gpu
|
||||
]
|
||||
steps:
|
||||
- name: Preliminary job status
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
echo "Runner status: ${{ needs.run_check_runners.result }}"
|
||||
echo "Runner availability: ${{ needs.check_runner_status.result }}"
|
||||
echo "Runner status: ${{ needs.check_runners.result }}"
|
||||
echo "Setup status: ${{ needs.setup.result }}"
|
||||
|
||||
- uses: actions/checkout@v2
|
||||
@@ -255,8 +276,9 @@ jobs:
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
|
||||
CI_EVENT: nightly-build
|
||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
|
||||
SETUP_STATUS: ${{ needs.setup.result }}
|
||||
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
|
||||
63
.github/workflows/self-past.yml
vendored
63
.github/workflows/self-past.yml
vendored
@@ -27,9 +27,43 @@ env:
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
|
||||
jobs:
|
||||
check_runner_status:
|
||||
name: Check Runner Status
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Check Runner Status
|
||||
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
|
||||
check_runners:
|
||||
name: Check Runners
|
||||
needs: check_runner_status
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
|
||||
container:
|
||||
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
setup:
|
||||
name: Setup
|
||||
runs-on: ubuntu-latest
|
||||
needs: check_runners
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
|
||||
container:
|
||||
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
steps:
|
||||
@@ -50,21 +84,6 @@ jobs:
|
||||
cd tests
|
||||
echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"
|
||||
|
||||
run_check_runners:
|
||||
name: Check Runners
|
||||
needs: setup
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
|
||||
container:
|
||||
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
run_tests_single_gpu:
|
||||
name: Model tests
|
||||
strategy:
|
||||
@@ -76,7 +95,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [setup, run_check_runners]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -129,7 +148,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [setup, run_check_runners]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -175,13 +194,14 @@ jobs:
|
||||
name: Send results to webhook
|
||||
runs-on: ubuntu-latest
|
||||
if: always()
|
||||
needs: [setup, run_check_runners, run_tests_single_gpu, run_tests_multi_gpu]
|
||||
needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu]
|
||||
steps:
|
||||
- name: Preliminary job status
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
echo "Runner status: ${{ needs.run_check_runners.result }}"
|
||||
echo "Runner availability: ${{ needs.check_runner_status.result }}"
|
||||
echo "Runner status: ${{ needs.check_runners.result }}"
|
||||
echo "Setup status: ${{ needs.setup.result }}"
|
||||
|
||||
- uses: actions/checkout@v2
|
||||
@@ -199,8 +219,9 @@ jobs:
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
|
||||
CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
|
||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
|
||||
SETUP_STATUS: ${{ needs.setup.result }}
|
||||
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
|
||||
68
.github/workflows/self-push.yml
vendored
68
.github/workflows/self-push.yml
vendored
@@ -27,9 +27,43 @@ env:
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
|
||||
jobs:
|
||||
check_runner_status:
|
||||
name: Check Runner Status
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Check Runner Status
|
||||
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
|
||||
check_runners:
|
||||
name: Check Runners
|
||||
needs: check_runner_status
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
setup:
|
||||
name: Setup
|
||||
runs-on: ubuntu-latest
|
||||
needs: check_runners
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
test_map: ${{ steps.set-matrix.outputs.test_map }}
|
||||
@@ -111,24 +145,9 @@ jobs:
|
||||
echo "::set-output name=matrix::$keys"
|
||||
echo "::set-output name=test_map::$test_map"
|
||||
|
||||
run_check_runners:
|
||||
name: Check Runners
|
||||
needs: setup
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
run_tests_single_gpu:
|
||||
name: Model tests
|
||||
needs: [setup, run_check_runners]
|
||||
needs: setup
|
||||
# `dummy` means there is no test to run
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
|
||||
strategy:
|
||||
@@ -213,7 +232,7 @@ jobs:
|
||||
|
||||
run_tests_multi_gpu:
|
||||
name: Model tests
|
||||
needs: [setup, run_check_runners]
|
||||
needs: setup
|
||||
# `dummy` means there is no test to run
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
|
||||
strategy:
|
||||
@@ -300,7 +319,7 @@ jobs:
|
||||
|
||||
run_tests_torch_cuda_extensions_single_gpu:
|
||||
name: Torch CUDA extension tests
|
||||
needs: [setup, run_check_runners]
|
||||
needs: setup
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -382,7 +401,7 @@ jobs:
|
||||
|
||||
run_tests_torch_cuda_extensions_multi_gpu:
|
||||
name: Torch CUDA extension tests
|
||||
needs: [setup, run_check_runners]
|
||||
needs: setup
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -467,8 +486,9 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
if: always()
|
||||
needs: [
|
||||
check_runner_status,
|
||||
check_runners,
|
||||
setup,
|
||||
run_check_runners,
|
||||
run_tests_single_gpu,
|
||||
run_tests_multi_gpu,
|
||||
run_tests_torch_cuda_extensions_single_gpu,
|
||||
@@ -479,8 +499,9 @@ jobs:
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
echo "Runner availability: ${{ needs.check_runner_status.result }}"
|
||||
echo "Setup status: ${{ needs.setup.result }}"
|
||||
echo "Runner status: ${{ needs.run_check_runners.result }}"
|
||||
echo "Runner status: ${{ needs.check_runners.result }}"
|
||||
|
||||
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
||||
# We also take into account the `push` event (we might want to test some changes in a branch)
|
||||
@@ -527,8 +548,9 @@ jobs:
|
||||
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
|
||||
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
|
||||
CI_SHA: ${{ env.CI_SHA }}
|
||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
|
||||
SETUP_STATUS: ${{ needs.setup.result }}
|
||||
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
|
||||
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
|
||||
38
.github/workflows/self-scheduled.yml
vendored
38
.github/workflows/self-scheduled.yml
vendored
@@ -22,8 +22,21 @@ env:
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
|
||||
jobs:
|
||||
run_check_runners:
|
||||
check_runner_status:
|
||||
name: Check Runner Status
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Check Runner Status
|
||||
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
|
||||
check_runners:
|
||||
name: Check Runners
|
||||
needs: check_runner_status
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
@@ -38,7 +51,7 @@ jobs:
|
||||
|
||||
setup:
|
||||
name: Setup
|
||||
needs: run_check_runners
|
||||
needs: check_runners
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
@@ -82,7 +95,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
@@ -135,7 +148,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
@@ -183,7 +196,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -226,7 +239,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -270,7 +283,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-tensorflow-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -312,7 +325,7 @@ jobs:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
@@ -362,7 +375,8 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
if: always()
|
||||
needs: [
|
||||
run_check_runners,
|
||||
check_runner_status,
|
||||
check_runners,
|
||||
setup,
|
||||
run_tests_single_gpu,
|
||||
run_tests_multi_gpu,
|
||||
@@ -376,7 +390,8 @@ jobs:
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
echo "Runner status: ${{ needs.run_check_runners.result }}"
|
||||
echo "Runner availability: ${{ needs.check_runner_status.result }}"
|
||||
echo "Runner status: ${{ needs.check_runners.result }}"
|
||||
echo "Setup status: ${{ needs.setup.result }}"
|
||||
|
||||
- uses: actions/checkout@v2
|
||||
@@ -389,8 +404,9 @@ jobs:
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_EVENT: scheduled
|
||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
|
||||
SETUP_STATUS: ${{ needs.setup.result }}
|
||||
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
|
||||
43
utils/check_self_hosted_runner.py
Normal file
43
utils/check_self_hosted_runner.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
|
||||
|
||||
def get_runner_status(target_runners, token):
|
||||
|
||||
cmd = (
|
||||
f'curl -H "Accept: application/vnd.github+json" -H "Authorization: Bearer {token}"'
|
||||
" https://api.github.com/repos/huggingface/transformers/actions/runners"
|
||||
)
|
||||
output = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE)
|
||||
o = output.stdout.decode("utf-8")
|
||||
status = json.loads(o)
|
||||
|
||||
runners = status["runners"]
|
||||
for runner in runners:
|
||||
if runner["name"] in target_runners:
|
||||
if runner["status"] == "offline":
|
||||
raise ValueError(f"{runner['name']} is offline!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
def list_str(values):
|
||||
return values.split(",")
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--target_runners",
|
||||
default=None,
|
||||
type=list_str,
|
||||
required=True,
|
||||
help="Comma-separated list of runners to check status.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--token", default=None, type=str, required=True, help="A token that has actions:read permission."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
get_runner_status(args.target_runners, args.token)
|
||||
@@ -387,7 +387,7 @@ class Message:
|
||||
return json.dumps(blocks)
|
||||
|
||||
@staticmethod
|
||||
def error_out(title, ci_title="", setup_failed=False, runner_failed=False):
|
||||
def error_out(title, ci_title="", runner_not_available=False, runner_failed=False, setup_failed=False):
|
||||
|
||||
blocks = []
|
||||
title_block = {"type": "header", "text": {"type": "plain_text", "text": title}}
|
||||
@@ -397,10 +397,12 @@ class Message:
|
||||
ci_title_block = {"type": "section", "text": {"type": "mrkdwn", "text": ci_title}}
|
||||
blocks.append(ci_title_block)
|
||||
|
||||
if setup_failed:
|
||||
text = "💔 Setup job failed. Tests are not run. 😭"
|
||||
if runner_not_available:
|
||||
text = "💔 CI runners are not available! Tests are not run. 😭"
|
||||
elif runner_failed:
|
||||
text = "💔 CI runners have problems! Tests are not run. 😭"
|
||||
elif setup_failed:
|
||||
text = "💔 Setup job failed. Tests are not run. 😭"
|
||||
else:
|
||||
text = "💔 There was an issue running the tests. 😭"
|
||||
|
||||
@@ -654,10 +656,13 @@ def prepare_reports(title, header, reports, to_truncate=True):
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
setup_status = os.environ.get("SETUP_STATUS")
|
||||
runner_status = os.environ.get("RUNNER_STATUS")
|
||||
runner_env_status = os.environ.get("RUNNER_ENV_STATUS")
|
||||
setup_status = os.environ.get("SETUP_STATUS")
|
||||
|
||||
runner_not_available = True if runner_status is not None and runner_status != "success" else False
|
||||
runner_failed = True if runner_env_status is not None and runner_env_status != "success" else False
|
||||
setup_failed = True if setup_status is not None and setup_status != "success" else False
|
||||
runner_failed = True if runner_status is not None and runner_status != "success" else False
|
||||
|
||||
org = "huggingface"
|
||||
repo = "transformers"
|
||||
@@ -718,8 +723,8 @@ if __name__ == "__main__":
|
||||
else:
|
||||
ci_title = ""
|
||||
|
||||
if setup_failed or runner_failed:
|
||||
Message.error_out(title, ci_title, setup_failed, runner_failed)
|
||||
if runner_not_available or runner_failed or setup_failed:
|
||||
Message.error_out(title, ci_title, runner_not_available, runner_failed, setup_failed)
|
||||
exit(0)
|
||||
|
||||
arguments = sys.argv[1:][0]
|
||||
@@ -728,7 +733,7 @@ if __name__ == "__main__":
|
||||
# Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names).
|
||||
models = [x.replace("models/", "models_") for x in models]
|
||||
except SyntaxError:
|
||||
Message.error_out()
|
||||
Message.error_out(title, ci_title)
|
||||
raise ValueError("Errored out.")
|
||||
|
||||
github_actions_job_links = get_job_links()
|
||||
|
||||
Reference in New Issue
Block a user