Ci test tf super slow (#8007)

* Test TF GPU CI
* Change cache
* Fix missing torch requirement
* Fix some model tests Style
* LXMERT
* MobileBERT
* Longformer skip test
* XLNet
* The rest of the tests
* RAG goes OOM in multi gpu setup
* YAML test files
* Last fixes
* Skip doctests
* Fill mask tests
* Yaml files
* Last test fix
* Style
* Update cache
* Change ONNX tests to slow + use tiny model
2020-10-30 14:25:48 +00:00
parent 7e36deec7a
commit 10f8c63620
25 changed files with 562 additions and 126 deletions
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -13,7 +13,7 @@ on:


 jobs:
-  run_tests_torch_and_tf_gpu:
+  run_tests_torch_gpu:
    runs-on: [self-hosted, single-gpu]
    steps:
      - uses: actions/checkout@v2
@@ -32,7 +32,7 @@ jobs:
        id: cache
        with:
          path: .env
-          key: v1-tests_tf_torch_gpu-${{ hashFiles('setup.py') }}
+          key: v1.1-tests_torch_gpu-${{ hashFiles('setup.py') }}

      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
        run: |
@@ -46,8 +46,7 @@ jobs:
        run: |
          source .env/bin/activate
          pip install --upgrade pip
-          pip install torch!=1.6.0
-          pip install .[sklearn,testing,onnxruntime]
+          pip install .[torch,sklearn,testing,onnxruntime]
          pip install git+https://github.com/huggingface/datasets

      - name: Are GPUs recognized by our DL frameworks
@@ -58,16 +57,14 @@ jobs:

      - name: Run all non-slow tests on GPU
        env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
-          # TF_GPU_MEMORY_LIMIT: 4096
          OMP_NUM_THREADS: 1
+          CUDA_VISIBLE_DEVICES: 0
        run: |
          source .env/bin/activate
-          python -m pytest -n 2 --dist=loadfile -s tests
+          python -m pytest -n 2 --dist=loadfile -s ./tests/

-
-  run_tests_torch_and_tf_multiple_gpu:
-    runs-on: [self-hosted, multi-gpu]
+  run_tests_tf_gpu:
+    runs-on: [self-hosted, single-gpu]
    steps:
      - uses: actions/checkout@v2
      - name: Python version
@@ -84,7 +81,57 @@ jobs:
        id: cache
        with:
          path: .env
-          key: v1-tests_tf_torch_multiple_gpu-${{ hashFiles('setup.py') }}
+          key: v1.1-tests_tf_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
+
+      - name: Install dependencies
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install .[tf,sklearn,testing,onnxruntime]
+          pip install git+https://github.com/huggingface/datasets
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          source .env/bin/activate
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+
+      - name: Run all non-slow tests on GPU
+        env:
+          OMP_NUM_THREADS: 1
+          CUDA_VISIBLE_DEVICES: 0
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s ./tests/
+
+  run_tests_torch_multiple_gpu:
+    runs-on: [self-hosted, multi-gpu]
+    steps:
+      - uses: actions/checkout@v2
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_torch_multiple_gpu-${{ hashFiles('setup.py') }}

      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
        run: |
@@ -97,8 +144,7 @@ jobs:
        run: |
          source .env/bin/activate
          pip install --upgrade pip
-          pip install torch!=1.6.0
-          pip install .[sklearn,testing,onnxruntime]
+          pip install .[torch,sklearn,testing,onnxruntime]
          pip install git+https://github.com/huggingface/datasets

      - name: Are GPUs recognized by our DL frameworks
@@ -109,8 +155,54 @@ jobs:

      - name: Run all non-slow tests on GPU
        env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
-          # TF_GPU_MEMORY_LIMIT: 4096
+          OMP_NUM_THREADS: 1
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s ./tests/
+
+  run_tests_tf_multiple_gpu:
+    runs-on: [self-hosted, multi-gpu]
+    steps:
+      - uses: actions/checkout@v2
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_tf_multiple_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
+      - name: Install dependencies
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install .[tf,sklearn,testing,onnxruntime]
+          pip install git+https://github.com/huggingface/datasets
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          source .env/bin/activate
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+
+      - name: Run all non-slow tests on GPU
+        env:
          OMP_NUM_THREADS: 1
        run: |
          source .env/bin/activate
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -9,7 +9,7 @@ on:
    - cron: "0 0 * * *"

 jobs:
-  run_all_tests_torch_and_tf_gpu:
+  run_all_tests_torch_gpu:
    runs-on: [self-hosted, single-gpu]
    steps:
      - uses: actions/checkout@v2
@@ -19,7 +19,7 @@ jobs:
        id: cache
        with:
          path: .env
-          key: v1-slow_tests_tf_torch_gpu-${{ hashFiles('setup.py') }}
+          key: v1.1-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}

      - name: Python version
        run: |
@@ -44,9 +44,9 @@ jobs:
        run: |
          source .env/bin/activate
          pip install --upgrade pip
-          pip install torch!=1.6.0
-          pip install .[sklearn,testing,onnxruntime]
+          pip install .[torch,sklearn,testing,onnxruntime]
          pip install git+https://github.com/huggingface/datasets
+          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -56,31 +56,29 @@ jobs:

      - name: Run all tests on GPU
        env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
        run: |
          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make_reports=tests tests
+          python -m pytest -n 1 --dist=loadfile -s --make_reports=tests_torch tests

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/report_tests_failures_short.txt
+        run: cat reports/report_tests_torch_failures_short.txt
        
      - name: Run examples tests on GPU
        if: ${{ always() }}
        env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
        run: |
          source .env/bin/activate
          pip install -r examples/requirements.txt
-          python -m pytest -n 1 --dist=loadfile -s --make_reports=examples examples
+          python -m pytest -n 1 --dist=loadfile -s --make_reports=examples_torch examples

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/report_examples_failures_short.txt
+        run: cat reports/report_examples_torch_failures_short.txt

      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
@@ -91,21 +89,85 @@ jobs:
          RUN_PIPELINE_TESTS: yes
        run: |
          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make_reports=tests_pipeline tests
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make_reports=tests_torch_pipeline tests

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/report_tests_pipeline_failures_short.txt
+        run: cat reports/report_tests_torch_pipeline_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
-          name: run_all_tests_torch_and_tf_gpu_test_reports
+          name: run_all_tests_torch_gpu_test_reports
          path: reports


-  run_all_tests_torch_and_tf_multiple_gpu:
+  run_all_tests_tf_gpu:
+    runs-on: [self-hosted, single-gpu]
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
+
+      - name: Install dependencies
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install .[tf,sklearn,testing,onnxruntime]
+          pip install git+https://github.com/huggingface/datasets
+          pip list
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          source .env/bin/activate
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+
+      - name: Run all tests on GPU
+        env:
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make_reports=tests_tf tests
+          
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/report_tests_tf_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_tf_gpu_test_reports
+          path: reports
+          
+  run_all_tests_torch_multiple_gpu:
    runs-on: [self-hosted, multi-gpu]
    steps:
      - uses: actions/checkout@v2
@@ -115,16 +177,18 @@ jobs:
        id: cache
        with:
          path: .env
-          key: v1-slow_tests_tf_torch_multi_gpu-${{ hashFiles('setup.py') }}
+          key: v0.1-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}

      - name: Python version
        run: |
          which python
          python --version
          pip --version
+
      - name: Current dir
        run: pwd
      - run: nvidia-smi
+
      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
        if: steps.cache.outputs.cache-hit != 'true'
        run: |
@@ -133,13 +197,14 @@ jobs:
          which python
          python --version
          pip --version
+
      - name: Install dependencies
        run: |
          source .env/bin/activate
          pip install --upgrade pip
-          pip install torch!=1.6.0
-          pip install .[sklearn,testing,onnxruntime]
+          pip install .[torch,sklearn,testing,onnxruntime]
          pip install git+https://github.com/huggingface/datasets
+          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -149,22 +214,104 @@ jobs:

      - name: Run all tests on GPU
        env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
        run: |
          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s ./tests/ --durations=50
+          python -m pytest -n 1 --dist=loadfile -s --make_reports=tests_torch tests

+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/report_tests_torch_failures_short.txt
+        
      - name: Run examples tests on GPU
+        if: ${{ always() }}
        env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
        run: |
          source .env/bin/activate
          pip install -r examples/requirements.txt
-          python -m pytest -n 1 --dist=loadfile -s examples --durations=50
+          python -m pytest -n 1 --dist=loadfile -s --make_reports=examples_torch examples
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/report_examples_torch_failures_short.txt
+
+      - name: Run all pipeline tests on GPU
+        if: ${{ always() }}
+        env:
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+          RUN_PIPELINE_TESTS: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make_reports=tests_torch_pipeline tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/report_tests_torch_pipeline_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_torch_multi_gpu_test_reports
+          path: reports
+
+  run_all_tests_tf_multiple_gpu:
+    runs-on: [self-hosted, multi-gpu]
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v0.1-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
+
+      - name: Install dependencies
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install .[tf,sklearn,testing,onnxruntime]
+          pip install git+https://github.com/huggingface/datasets
+          pip list
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          source .env/bin/activate
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+
+      - name: Run all tests on GPU
+        env:
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make_reports=tests_tf ./tests/ --durations=0

      - name: Run all pipeline tests on GPU
        env:
@@ -175,3 +322,15 @@ jobs:
        run: |
          source .env/bin/activate
          python -m pytest -n 1 --dist=loadfile -s ./tests/ -m is_pipeline_test --durations=50
+          
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/report_tests_tf_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_tf_multi_gpu_test_reports
+          path: reports
+