Ci test tf super slow (#8007)

* Test TF GPU CI

* Change cache

* Fix missing torch requirement

* Fix some model tests


Style

* LXMERT

* MobileBERT

* Longformer skip test

* XLNet

* The rest of the tests

* RAG goes OOM in multi gpu setup

* YAML test files

* Last fixes

* Skip doctests

* Fill mask tests

* Yaml files

* Last test fix

* Style

* Update cache

* Change ONNX tests to slow + use tiny model
This commit is contained in:
Lysandre Debut
2020-10-30 14:25:48 +00:00
committed by GitHub
parent 7e36deec7a
commit 10f8c63620
25 changed files with 562 additions and 126 deletions

View File

@@ -13,7 +13,7 @@ on:
jobs:
run_tests_torch_and_tf_gpu:
run_tests_torch_gpu:
runs-on: [self-hosted, single-gpu]
steps:
- uses: actions/checkout@v2
@@ -32,7 +32,7 @@ jobs:
id: cache
with:
path: .env
key: v1-tests_tf_torch_gpu-${{ hashFiles('setup.py') }}
key: v1.1-tests_torch_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
run: |
@@ -46,8 +46,7 @@ jobs:
run: |
source .env/bin/activate
pip install --upgrade pip
pip install torch!=1.6.0
pip install .[sklearn,testing,onnxruntime]
pip install .[torch,sklearn,testing,onnxruntime]
pip install git+https://github.com/huggingface/datasets
- name: Are GPUs recognized by our DL frameworks
@@ -58,16 +57,14 @@ jobs:
- name: Run all non-slow tests on GPU
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
# TF_GPU_MEMORY_LIMIT: 4096
OMP_NUM_THREADS: 1
CUDA_VISIBLE_DEVICES: 0
run: |
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s tests
python -m pytest -n 2 --dist=loadfile -s ./tests/
run_tests_torch_and_tf_multiple_gpu:
runs-on: [self-hosted, multi-gpu]
run_tests_tf_gpu:
runs-on: [self-hosted, single-gpu]
steps:
- uses: actions/checkout@v2
- name: Python version
@@ -84,7 +81,57 @@ jobs:
id: cache
with:
path: .env
key: v1-tests_tf_torch_multiple_gpu-${{ hashFiles('setup.py') }}
key: v1.1-tests_tf_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[tf,sklearn,testing,onnxruntime]
pip install git+https://github.com/huggingface/datasets
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Run all non-slow tests on GPU
env:
OMP_NUM_THREADS: 1
CUDA_VISIBLE_DEVICES: 0
run: |
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s ./tests/
run_tests_torch_multiple_gpu:
runs-on: [self-hosted, multi-gpu]
steps:
- uses: actions/checkout@v2
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-tests_torch_multiple_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
run: |
@@ -97,8 +144,7 @@ jobs:
run: |
source .env/bin/activate
pip install --upgrade pip
pip install torch!=1.6.0
pip install .[sklearn,testing,onnxruntime]
pip install .[torch,sklearn,testing,onnxruntime]
pip install git+https://github.com/huggingface/datasets
- name: Are GPUs recognized by our DL frameworks
@@ -109,8 +155,54 @@ jobs:
- name: Run all non-slow tests on GPU
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
# TF_GPU_MEMORY_LIMIT: 4096
OMP_NUM_THREADS: 1
run: |
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s ./tests/
run_tests_tf_multiple_gpu:
runs-on: [self-hosted, multi-gpu]
steps:
- uses: actions/checkout@v2
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-tests_tf_multiple_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[tf,sklearn,testing,onnxruntime]
pip install git+https://github.com/huggingface/datasets
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Run all non-slow tests on GPU
env:
OMP_NUM_THREADS: 1
run: |
source .env/bin/activate

View File

@@ -9,7 +9,7 @@ on:
- cron: "0 0 * * *"
jobs:
run_all_tests_torch_and_tf_gpu:
run_all_tests_torch_gpu:
runs-on: [self-hosted, single-gpu]
steps:
- uses: actions/checkout@v2
@@ -19,7 +19,7 @@ jobs:
id: cache
with:
path: .env
key: v1-slow_tests_tf_torch_gpu-${{ hashFiles('setup.py') }}
key: v 1.1-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}
- name: Python version
run: |
@@ -44,9 +44,9 @@ jobs:
run: |
source .env/bin/activate
pip install --upgrade pip
pip install torch!=1.6.0
pip install .[sklearn,testing,onnxruntime]
pip install .[torch,sklearn,testing,onnxruntime]
pip install git+https://github.com/huggingface/datasets
pip list
- name: Are GPUs recognized by our DL frameworks
run: |
@@ -56,31 +56,29 @@ jobs:
- name: Run all tests on GPU
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make_reports=tests tests
python -m pytest -n 1 --dist=loadfile -s --make_reports=tests_torch tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/report_tests_failures_short.txt
run: cat reports/report_test_torch_failures_short.txt
- name: Run examples tests on GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
pip install -r examples/requirements.txt
python -m pytest -n 1 --dist=loadfile -s --make_reports=examples examples
python -m pytest -n 1 --dist=loadfile -s --make_reports=examples_torch examples
- name: Failure short reports
if: ${{ always() }}
run: cat reports/report_examples_failures_short.txt
run: cat reports/report_examples_torch_failures_short.txt
- name: Run all pipeline tests on GPU
if: ${{ always() }}
@@ -91,21 +89,85 @@ jobs:
RUN_PIPELINE_TESTS: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make_reports=tests_pipeline tests
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make_reports=tests_torch_pipeline tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/report_tests_pipeline_failures_short.txt
run: cat reports/report_tests_torch_pipeline_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_torch_and_tf_gpu_test_reports
name: run_all_tests_torch_gpu_test_reports
path: reports
run_all_tests_torch_and_tf_multiple_gpu:
run_all_tests_tf_gpu:
runs-on: [self-hosted, single-gpu]
steps:
- uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[tf,sklearn,testing,onnxruntime]
pip install git+https://github.com/huggingface/datasets
pip list
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Run all tests on GPU
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make_reports=tests_tf tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/report_test_tf_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_tf_gpu_test_reports
path: reports
run_all_tests_torch_multiple_gpu:
runs-on: [self-hosted, multi-gpu]
steps:
- uses: actions/checkout@v2
@@ -115,16 +177,18 @@ jobs:
id: cache
with:
path: .env
key: v1-slow_tests_tf_torch_multi_gpu-${{ hashFiles('setup.py') }}
key: v0.1-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
@@ -133,13 +197,14 @@ jobs:
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install torch!=1.6.0
pip install .[sklearn,testing,onnxruntime]
pip install .[torch,sklearn,testing,onnxruntime]
pip install git+https://github.com/huggingface/datasets
pip list
- name: Are GPUs recognized by our DL frameworks
run: |
@@ -149,22 +214,104 @@ jobs:
- name: Run all tests on GPU
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s ./tests/ --durations=50
python -m pytest -n 1 --dist=loadfile -s --make_reports=tests_torch tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/report_test_torch_failures_short.txt
- name: Run examples tests on GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
pip install -r examples/requirements.txt
python -m pytest -n 1 --dist=loadfile -s examples --durations=50
python -m pytest -n 1 --dist=loadfile -s --make_reports=examples_torch examples
- name: Failure short reports
if: ${{ always() }}
run: cat reports/report_examples_torch_failures_short.txt
- name: Run all pipeline tests on GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
RUN_PIPELINE_TESTS: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make_reports=tests_torch_pipeline tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/report_tests_torch_pipeline_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_torch_multi_gpu_test_reports
path: reports
run_all_tests_tf_multiple_gpu:
runs-on: [self-hosted, multi-gpu]
steps:
- uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v0.1-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[tf,sklearn,testing,onnxruntime]
pip install git+https://github.com/huggingface/datasets
pip list
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Run all tests on GPU
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s ./tests/ --durations=0
- name: Run all pipeline tests on GPU
env:
@@ -175,3 +322,15 @@ jobs:
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s ./tests/ -m is_pipeline_test --durations=50
- name: Failure short reports
if: ${{ always() }}
run: cat reports/report_test_tf_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_tf_multi_gpu_test_reports
path: reports