Add runner availability check (#19054)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
38
.github/workflows/self-scheduled.yml
vendored
38
.github/workflows/self-scheduled.yml
vendored
@@ -22,8 +22,21 @@ env:
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
|
||||
jobs:
|
||||
run_check_runners:
|
||||
check_runner_status:
|
||||
name: Check Runner Status
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Check Runner Status
|
||||
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
|
||||
check_runners:
|
||||
name: Check Runners
|
||||
needs: check_runner_status
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
@@ -38,7 +51,7 @@ jobs:
|
||||
|
||||
setup:
|
||||
name: Setup
|
||||
needs: run_check_runners
|
||||
needs: check_runners
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
@@ -82,7 +95,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
@@ -135,7 +148,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
@@ -183,7 +196,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -226,7 +239,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -270,7 +283,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-tensorflow-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -312,7 +325,7 @@ jobs:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
@@ -362,7 +375,8 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
if: always()
|
||||
needs: [
|
||||
run_check_runners,
|
||||
check_runner_status,
|
||||
check_runners,
|
||||
setup,
|
||||
run_tests_single_gpu,
|
||||
run_tests_multi_gpu,
|
||||
@@ -376,7 +390,8 @@ jobs:
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
echo "Runner status: ${{ needs.run_check_runners.result }}"
|
||||
echo "Runner availability: ${{ needs.check_runner_status.result }}"
|
||||
echo "Runner status: ${{ needs.check_runners.result }}"
|
||||
echo "Setup status: ${{ needs.setup.result }}"
|
||||
|
||||
- uses: actions/checkout@v2
|
||||
@@ -389,8 +404,9 @@ jobs:
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_EVENT: scheduled
|
||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
|
||||
SETUP_STATUS: ${{ needs.setup.result }}
|
||||
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
|
||||
Reference in New Issue
Block a user