Tests run on Docker (#10681)
* Tests run on Docker Co-authored-by: Morgan <funtowiczmo@gmail.com> * Comments from code review * Reply to itself * Dependencies Co-authored-by: Morgan <funtowiczmo@gmail.com>
This commit is contained in:
311
.github/workflows/self-scheduled.yml
vendored
311
.github/workflows/self-scheduled.yml
vendored
@@ -1,8 +1,3 @@
|
||||
# configuration notes:
|
||||
#
|
||||
# - `source .env/bin/activate` is currently needed to be run first thing first in each step. Otherwise
|
||||
# the step uses the system-wide python interpreter.
|
||||
|
||||
name: Self-hosted runner (scheduled)
|
||||
|
||||
on:
|
||||
@@ -15,61 +10,39 @@ on:
|
||||
|
||||
jobs:
|
||||
run_all_tests_torch_gpu:
|
||||
runs-on: [self-hosted, gpu, single-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, single-gpu]
|
||||
container:
|
||||
image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Launcher docker
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Loading cache.
|
||||
uses: actions/cache@v2
|
||||
id: cache
|
||||
with:
|
||||
path: .env
|
||||
key: v 1.2-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}
|
||||
|
||||
- name: Python version
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
which python
|
||||
python --version
|
||||
pip --version
|
||||
|
||||
- name: Current dir
|
||||
run: pwd
|
||||
|
||||
- run: nvidia-smi
|
||||
|
||||
- name: Kill any run-away pytest processes
|
||||
run: (pkill -f tests; pkill -f examples) || echo "no zombies"
|
||||
|
||||
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
|
||||
if: steps.cache.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
python -m venv .env
|
||||
source .env/bin/activate
|
||||
which python
|
||||
python --version
|
||||
pip --version
|
||||
nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
apt -y update && apt install -y libsndfile1-dev
|
||||
pip install --upgrade pip
|
||||
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
|
||||
pip install git+https://github.com/huggingface/datasets
|
||||
pip list
|
||||
pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
|
||||
python -c "import torch; print('Cuda version:', torch.version.cuda)"
|
||||
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
|
||||
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
|
||||
|
||||
- name: Run all tests on GPU
|
||||
env:
|
||||
OMP_NUM_THREADS: 1
|
||||
OMP_NUM_THREADS: 16
|
||||
MKL_NUM_THREADS: 16
|
||||
RUN_SLOW: yes
|
||||
HF_HOME: /mnt/cache
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_gpu tests
|
||||
python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
@@ -78,12 +51,13 @@ jobs:
|
||||
- name: Run examples tests on GPU
|
||||
if: ${{ always() }}
|
||||
env:
|
||||
OMP_NUM_THREADS: 1
|
||||
OMP_NUM_THREADS: 16
|
||||
MKL_NUM_THREADS: 16
|
||||
RUN_SLOW: yes
|
||||
HF_HOME: /mnt/cache
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
pip install -r examples/_tests_requirements.txt
|
||||
python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples
|
||||
python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
@@ -92,13 +66,13 @@ jobs:
|
||||
- name: Run all pipeline tests on GPU
|
||||
if: ${{ always() }}
|
||||
env:
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: "true"
|
||||
OMP_NUM_THREADS: 1
|
||||
OMP_NUM_THREADS: 16
|
||||
MKL_NUM_THREADS: 16
|
||||
RUN_SLOW: yes
|
||||
RUN_PIPELINE_TESTS: yes
|
||||
HF_HOME: /mnt/cache
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
|
||||
python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
@@ -111,64 +85,39 @@ jobs:
|
||||
name: run_all_tests_torch_gpu_test_reports
|
||||
path: reports
|
||||
|
||||
|
||||
run_all_tests_tf_gpu:
|
||||
runs-on: [self-hosted, gpu, single-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, single-gpu]
|
||||
container:
|
||||
image: tensorflow/tensorflow:2.4.1-gpu
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Launcher docker
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Loading cache.
|
||||
uses: actions/cache@v2
|
||||
id: cache
|
||||
with:
|
||||
path: .env
|
||||
key: v1.2-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}
|
||||
|
||||
- name: Python version
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
which python
|
||||
python --version
|
||||
pip --version
|
||||
|
||||
- name: Current dir
|
||||
run: pwd
|
||||
|
||||
- run: nvidia-smi
|
||||
|
||||
- name: Kill any run-away pytest processes
|
||||
run: (pkill -f tests; pkill -f examples) || echo "no zombies"
|
||||
|
||||
|
||||
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
|
||||
if: steps.cache.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
python -m venv .env
|
||||
source .env/bin/activate
|
||||
which python
|
||||
python --version
|
||||
pip --version
|
||||
nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
pip install --upgrade pip
|
||||
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
|
||||
pip install git+https://github.com/huggingface/datasets
|
||||
pip list
|
||||
pip install .[sklearn,testing,onnx,sentencepiece]
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
|
||||
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
|
||||
|
||||
- name: Run all tests on GPU
|
||||
env:
|
||||
OMP_NUM_THREADS: 1
|
||||
RUN_SLOW: yes
|
||||
HF_HOME: /mnt/cache
|
||||
OMP_NUM_THREADS: 16
|
||||
TF_NUM_INTEROP_THREADS: 1
|
||||
TF_NUM_INTRAOP_THREADS: 16
|
||||
MKL_NUM_THREADS: 16
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_gpu tests
|
||||
python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
@@ -177,17 +126,19 @@ jobs:
|
||||
- name: Run all pipeline tests on GPU
|
||||
if: ${{ always() }}
|
||||
env:
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: "true"
|
||||
OMP_NUM_THREADS: 1
|
||||
RUN_SLOW: yes
|
||||
HF_HOME: /mnt/cache
|
||||
OMP_NUM_THREADS: 16
|
||||
RUN_PIPELINE_TESTS: yes
|
||||
TF_NUM_INTEROP_THREADS: 1
|
||||
TF_NUM_INTRAOP_THREADS: 16
|
||||
MKL_NUM_THREADS: 16
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_gpu tests
|
||||
python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_tf_pipelines_gpu_failures_short.txt
|
||||
run: cat reports/tests_tf_pipeline_gpu_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
@@ -197,92 +148,55 @@ jobs:
|
||||
path: reports
|
||||
|
||||
run_all_tests_torch_multi_gpu:
|
||||
runs-on: [self-hosted, gpu, multi-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, multi-gpu]
|
||||
container:
|
||||
image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Launcher docker
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Loading cache.
|
||||
uses: actions/cache@v2
|
||||
id: cache
|
||||
with:
|
||||
path: .env
|
||||
key: v1.2-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
|
||||
|
||||
- name: Python version
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
which python
|
||||
python --version
|
||||
pip --version
|
||||
|
||||
- name: Current dir
|
||||
run: pwd
|
||||
|
||||
- run: nvidia-smi
|
||||
|
||||
- name: Kill any run-away pytest processes
|
||||
run: (pkill -f tests; pkill -f examples) || echo "no zombies"
|
||||
|
||||
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
|
||||
if: steps.cache.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
python -m venv .env
|
||||
source .env/bin/activate
|
||||
which python
|
||||
python --version
|
||||
pip --version
|
||||
nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
apt -y update && apt install -y libsndfile1-dev
|
||||
pip install --upgrade pip
|
||||
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
|
||||
pip install git+https://github.com/huggingface/datasets
|
||||
pip install fairscale
|
||||
pip install deepspeed
|
||||
pip list
|
||||
pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
|
||||
python -c "import torch; print('Cuda version:', torch.version.cuda)"
|
||||
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
|
||||
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
|
||||
|
||||
- name: Run all tests on multi-GPU
|
||||
- name: Run all tests on GPU
|
||||
env:
|
||||
OMP_NUM_THREADS: 1
|
||||
RUN_SLOW: yes
|
||||
HF_HOME: /mnt/cache
|
||||
OMP_NUM_THREADS: 16
|
||||
MKL_NUM_THREADS: 16
|
||||
MKL_SERVICE_FORCE_INTEL: 1
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
|
||||
python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_multi_gpu_failures_short.txt
|
||||
|
||||
- name: Run examples tests on multi-GPU
|
||||
- name: Run all pipeline tests on GPU
|
||||
if: ${{ always() }}
|
||||
env:
|
||||
OMP_NUM_THREADS: 1
|
||||
RUN_SLOW: yes
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
pip install -r examples/_tests_requirements.txt
|
||||
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_examples_multi_gpu examples
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_examples_multi_gpu_failures_short.txt
|
||||
|
||||
- name: Run all pipeline tests on multi-GPU
|
||||
if: ${{ always() }}
|
||||
env:
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: "true"
|
||||
OMP_NUM_THREADS: 1
|
||||
OMP_NUM_THREADS: 16
|
||||
MKL_NUM_THREADS: 16
|
||||
RUN_SLOW: yes
|
||||
RUN_PIPELINE_TESTS: yes
|
||||
HF_HOME: /mnt/cache
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
|
||||
python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
@@ -296,76 +210,55 @@ jobs:
|
||||
path: reports
|
||||
|
||||
run_all_tests_tf_multi_gpu:
|
||||
runs-on: [self-hosted, gpu, multi-gpu]
|
||||
runs-on: [self-hosted, docker-gpu, multi-gpu]
|
||||
container:
|
||||
image: tensorflow/tensorflow:2.4.1-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Launcher docker
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Loading cache.
|
||||
uses: actions/cache@v2
|
||||
id: cache
|
||||
with:
|
||||
path: .env
|
||||
key: v1.2-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
|
||||
|
||||
- name: Python version
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
which python
|
||||
python --version
|
||||
pip --version
|
||||
|
||||
- name: Current dir
|
||||
run: pwd
|
||||
|
||||
- run: nvidia-smi
|
||||
|
||||
- name: Kill any run-away pytest processes
|
||||
run: (pkill -f tests; pkill -f examples) || echo "no zombies"
|
||||
|
||||
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
|
||||
if: steps.cache.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
python -m venv .env
|
||||
source .env/bin/activate
|
||||
which python
|
||||
python --version
|
||||
pip --version
|
||||
nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
pip install --upgrade pip
|
||||
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
|
||||
pip install git+https://github.com/huggingface/datasets
|
||||
pip list
|
||||
pip install .[sklearn,testing,onnx,sentencepiece]
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
|
||||
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
|
||||
|
||||
- name: Run all tests on multi-GPU
|
||||
- name: Run all tests on GPU
|
||||
env:
|
||||
OMP_NUM_THREADS: 1
|
||||
OMP_NUM_THREADS: 16
|
||||
RUN_SLOW: yes
|
||||
MKL_NUM_THREADS: 16
|
||||
TF_NUM_INTEROP_THREADS: 1
|
||||
TF_NUM_INTRAOP_THREADS: 16
|
||||
HF_HOME: /mnt/cache
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
|
||||
python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_tf_multi_gpu_failures_short.txt
|
||||
|
||||
- name: Run all pipeline tests on multi-GPU
|
||||
- name: Run all pipeline tests on GPU
|
||||
if: ${{ always() }}
|
||||
env:
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: "true"
|
||||
OMP_NUM_THREADS: 1
|
||||
OMP_NUM_THREADS: 16
|
||||
RUN_SLOW: yes
|
||||
RUN_PIPELINE_TESTS: yes
|
||||
MKL_NUM_THREADS: 16
|
||||
TF_NUM_INTEROP_THREADS: 1
|
||||
TF_NUM_INTRAOP_THREADS: 16
|
||||
HF_HOME: /mnt/cache
|
||||
run: |
|
||||
source .env/bin/activate
|
||||
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
|
||||
python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
@@ -377,3 +270,23 @@ jobs:
|
||||
with:
|
||||
name: run_all_tests_tf_multi_gpu_test_reports
|
||||
path: reports
|
||||
|
||||
send_results:
|
||||
name: Send results to webhook
|
||||
runs-on: ubuntu-latest
|
||||
if: always()
|
||||
needs: [run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- uses: actions/download-artifact@v2
|
||||
|
||||
- name: Send message to Slack
|
||||
env:
|
||||
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
|
||||
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
|
||||
|
||||
|
||||
run: |
|
||||
pip install slack_sdk
|
||||
python utils/notification_service.py scheduled
|
||||
|
||||
Reference in New Issue
Block a user