Add runner availability check (#19054)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
57
.github/workflows/check_runner_status.yml
vendored
Normal file
57
.github/workflows/check_runner_status.yml
vendored
Normal file
@@ -0,0 +1,57 @@
|
||||
name: Self-hosted runner (check runner status)
|
||||
|
||||
# Note that each job's dependencies go into a corresponding docker file.
|
||||
#
|
||||
# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
|
||||
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
|
||||
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
|
||||
|
||||
on:
|
||||
repository_dispatch:
|
||||
schedule:
|
||||
# run per hour
|
||||
- cron: "* */1 * * *"
|
||||
|
||||
env:
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
|
||||
jobs:
|
||||
check_runner_status:
|
||||
name: Check Runner Status
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Check Runner Status
|
||||
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker,single-gpu-scheduled-ci-runner-docker,multi-scheduled-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
|
||||
send_results:
|
||||
name: Send results to webhook
|
||||
runs-on: ubuntu-latest
|
||||
needs: check_runner_status
|
||||
if: ${{ failure() }}
|
||||
steps:
|
||||
- name: Preliminary job status
|
||||
shell: bash
|
||||
run: |
|
||||
echo "Runner availability: ${{ needs.check_runner_status.result }}"
|
||||
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/download-artifact@v2
|
||||
- name: Send message to Slack
|
||||
env:
|
||||
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
|
||||
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
|
||||
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_EVENT: runner status check
|
||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
pip install slack_sdk
|
||||
python utils/notification_service.py
|
||||
38
.github/workflows/self-nightly-scheduled.yml
vendored
38
.github/workflows/self-nightly-scheduled.yml
vendored
@@ -23,8 +23,21 @@ env:
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
|
||||
jobs:
|
||||
run_check_runners:
|
||||
check_runner_status:
|
||||
name: Check Runner Status
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Check Runner Status
|
||||
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
|
||||
check_runners:
|
||||
name: Check Runners
|
||||
needs: check_runner_status
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
@@ -39,7 +52,7 @@ jobs:
|
||||
|
||||
setup:
|
||||
name: Setup
|
||||
needs: run_check_runners
|
||||
needs: check_runners
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
@@ -83,7 +96,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-torch-nightly-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
@@ -136,7 +149,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-torch-nightly-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
@@ -185,7 +198,7 @@ jobs:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
@@ -236,13 +249,21 @@ jobs:
|
||||
name: Send results to webhook
|
||||
runs-on: ubuntu-latest
|
||||
if: always()
|
||||
needs: [run_check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu]
|
||||
needs: [
|
||||
check_runner_status,
|
||||
check_runners,
|
||||
setup,
|
||||
run_tests_single_gpu,
|
||||
run_tests_multi_gpu,
|
||||
run_all_tests_torch_cuda_extensions_gpu
|
||||
]
|
||||
steps:
|
||||
- name: Preliminary job status
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
echo "Runner status: ${{ needs.run_check_runners.result }}"
|
||||
echo "Runner availability: ${{ needs.check_runner_status.result }}"
|
||||
echo "Runner status: ${{ needs.check_runners.result }}"
|
||||
echo "Setup status: ${{ needs.setup.result }}"
|
||||
|
||||
- uses: actions/checkout@v2
|
||||
@@ -255,8 +276,9 @@ jobs:
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
|
||||
CI_EVENT: nightly-build
|
||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
|
||||
SETUP_STATUS: ${{ needs.setup.result }}
|
||||
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
|
||||
63
.github/workflows/self-past.yml
vendored
63
.github/workflows/self-past.yml
vendored
@@ -27,9 +27,43 @@ env:
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
|
||||
jobs:
|
||||
check_runner_status:
|
||||
name: Check Runner Status
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Check Runner Status
|
||||
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
|
||||
check_runners:
|
||||
name: Check Runners
|
||||
needs: check_runner_status
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
|
||||
container:
|
||||
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
setup:
|
||||
name: Setup
|
||||
runs-on: ubuntu-latest
|
||||
needs: check_runners
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
|
||||
container:
|
||||
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
steps:
|
||||
@@ -50,21 +84,6 @@ jobs:
|
||||
cd tests
|
||||
echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"
|
||||
|
||||
run_check_runners:
|
||||
name: Check Runners
|
||||
needs: setup
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
|
||||
container:
|
||||
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
run_tests_single_gpu:
|
||||
name: Model tests
|
||||
strategy:
|
||||
@@ -76,7 +95,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [setup, run_check_runners]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -129,7 +148,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [setup, run_check_runners]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -175,13 +194,14 @@ jobs:
|
||||
name: Send results to webhook
|
||||
runs-on: ubuntu-latest
|
||||
if: always()
|
||||
needs: [setup, run_check_runners, run_tests_single_gpu, run_tests_multi_gpu]
|
||||
needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu]
|
||||
steps:
|
||||
- name: Preliminary job status
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
echo "Runner status: ${{ needs.run_check_runners.result }}"
|
||||
echo "Runner availability: ${{ needs.check_runner_status.result }}"
|
||||
echo "Runner status: ${{ needs.check_runners.result }}"
|
||||
echo "Setup status: ${{ needs.setup.result }}"
|
||||
|
||||
- uses: actions/checkout@v2
|
||||
@@ -199,8 +219,9 @@ jobs:
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
|
||||
CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
|
||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
|
||||
SETUP_STATUS: ${{ needs.setup.result }}
|
||||
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
|
||||
68
.github/workflows/self-push.yml
vendored
68
.github/workflows/self-push.yml
vendored
@@ -27,9 +27,43 @@ env:
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
|
||||
jobs:
|
||||
check_runner_status:
|
||||
name: Check Runner Status
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Check Runner Status
|
||||
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
|
||||
check_runners:
|
||||
name: Check Runners
|
||||
needs: check_runner_status
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
setup:
|
||||
name: Setup
|
||||
runs-on: ubuntu-latest
|
||||
needs: check_runners
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
test_map: ${{ steps.set-matrix.outputs.test_map }}
|
||||
@@ -111,24 +145,9 @@ jobs:
|
||||
echo "::set-output name=matrix::$keys"
|
||||
echo "::set-output name=test_map::$test_map"
|
||||
|
||||
run_check_runners:
|
||||
name: Check Runners
|
||||
needs: setup
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
run_tests_single_gpu:
|
||||
name: Model tests
|
||||
needs: [setup, run_check_runners]
|
||||
needs: setup
|
||||
# `dummy` means there is no test to run
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
|
||||
strategy:
|
||||
@@ -213,7 +232,7 @@ jobs:
|
||||
|
||||
run_tests_multi_gpu:
|
||||
name: Model tests
|
||||
needs: [setup, run_check_runners]
|
||||
needs: setup
|
||||
# `dummy` means there is no test to run
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
|
||||
strategy:
|
||||
@@ -300,7 +319,7 @@ jobs:
|
||||
|
||||
run_tests_torch_cuda_extensions_single_gpu:
|
||||
name: Torch CUDA extension tests
|
||||
needs: [setup, run_check_runners]
|
||||
needs: setup
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -382,7 +401,7 @@ jobs:
|
||||
|
||||
run_tests_torch_cuda_extensions_multi_gpu:
|
||||
name: Torch CUDA extension tests
|
||||
needs: [setup, run_check_runners]
|
||||
needs: setup
|
||||
if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -467,8 +486,9 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
if: always()
|
||||
needs: [
|
||||
check_runner_status,
|
||||
check_runners,
|
||||
setup,
|
||||
run_check_runners,
|
||||
run_tests_single_gpu,
|
||||
run_tests_multi_gpu,
|
||||
run_tests_torch_cuda_extensions_single_gpu,
|
||||
@@ -479,8 +499,9 @@ jobs:
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
echo "Runner availability: ${{ needs.check_runner_status.result }}"
|
||||
echo "Setup status: ${{ needs.setup.result }}"
|
||||
echo "Runner status: ${{ needs.run_check_runners.result }}"
|
||||
echo "Runner status: ${{ needs.check_runners.result }}"
|
||||
|
||||
# Necessary to get the correct branch name and commit SHA for `workflow_run` event
|
||||
# We also take into account the `push` event (we might want to test some changes in a branch)
|
||||
@@ -527,8 +548,9 @@ jobs:
|
||||
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
|
||||
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
|
||||
CI_SHA: ${{ env.CI_SHA }}
|
||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
|
||||
SETUP_STATUS: ${{ needs.setup.result }}
|
||||
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
|
||||
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
|
||||
38
.github/workflows/self-scheduled.yml
vendored
38
.github/workflows/self-scheduled.yml
vendored
@@ -22,8 +22,21 @@ env:
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
|
||||
jobs:
|
||||
run_check_runners:
|
||||
check_runner_status:
|
||||
name: Check Runner Status
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout transformers
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Check Runner Status
|
||||
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
|
||||
check_runners:
|
||||
name: Check Runners
|
||||
needs: check_runner_status
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
@@ -38,7 +51,7 @@ jobs:
|
||||
|
||||
setup:
|
||||
name: Setup
|
||||
needs: run_check_runners
|
||||
needs: check_runners
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
@@ -82,7 +95,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
@@ -135,7 +148,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
@@ -183,7 +196,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -226,7 +239,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -270,7 +283,7 @@ jobs:
|
||||
container:
|
||||
image: huggingface/transformers-tensorflow-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
@@ -312,7 +325,7 @@ jobs:
|
||||
matrix:
|
||||
machine_type: [single-gpu, multi-gpu]
|
||||
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
|
||||
needs: [run_check_runners, setup]
|
||||
needs: setup
|
||||
container:
|
||||
image: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
@@ -362,7 +375,8 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
if: always()
|
||||
needs: [
|
||||
run_check_runners,
|
||||
check_runner_status,
|
||||
check_runners,
|
||||
setup,
|
||||
run_tests_single_gpu,
|
||||
run_tests_multi_gpu,
|
||||
@@ -376,7 +390,8 @@ jobs:
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
run: |
|
||||
echo "Runner status: ${{ needs.run_check_runners.result }}"
|
||||
echo "Runner availability: ${{ needs.check_runner_status.result }}"
|
||||
echo "Runner status: ${{ needs.check_runners.result }}"
|
||||
echo "Setup status: ${{ needs.setup.result }}"
|
||||
|
||||
- uses: actions/checkout@v2
|
||||
@@ -389,8 +404,9 @@ jobs:
|
||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_EVENT: scheduled
|
||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
|
||||
SETUP_STATUS: ${{ needs.setup.result }}
|
||||
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
|
||||
Reference in New Issue
Block a user