Tests run on Docker (#10681)

* Tests run on Docker

Co-authored-by: Morgan <funtowiczmo@gmail.com>

* Comments from code review

* Reply to itself

* Dependencies

Co-authored-by: Morgan <funtowiczmo@gmail.com>
2021-03-15 17:28:01 -04:00
parent d41dd5359b
commit 58f672e65c
7 changed files with 414 additions and 373 deletions
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -1,8 +1,3 @@
-# configuration notes:
-#
-# - `source .env/bin/activate` is currently needed to be run first thing first in each step. Otherwise
-# the step uses the system-wide python interpreter.
-
 name: Self-hosted runner (scheduled)

 on:
@@ -15,61 +10,39 @@ on:

 jobs:
  run_all_tests_torch_gpu:
-    runs-on: [self-hosted, gpu, single-gpu]
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
-      - uses: actions/checkout@v2
+      - name: Launcher docker
+        uses: actions/checkout@v2

-      - name: Loading cache.
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: .env
-          key: v  1.2-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}
-
-      - name: Python version
+      - name: NVIDIA-SMI
        run: |
-          which python
-          python --version
-          pip --version
-
-      - name: Current dir
-        run: pwd
-
-      - run: nvidia-smi
-
-      - name: Kill any run-away pytest processes
-        run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-
-      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-        if: steps.cache.outputs.cache-hit != 'true'
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          which python
-          python --version
-          pip --version
+          nvidia-smi

      - name: Install dependencies
        run: |
-          source .env/bin/activate
+          apt -y update && apt install -y libsndfile1-dev
          pip install --upgrade pip
-          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
-          pip install git+https://github.com/huggingface/datasets
-          pip list
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]

      - name: Are GPUs recognized by our DL frameworks
        run: |
-          source .env/bin/activate
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all tests on GPU
        env:
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 16
+          MKL_NUM_THREADS: 16
          RUN_SLOW: yes
+          HF_HOME: /mnt/cache
        run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_gpu tests
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -78,12 +51,13 @@ jobs:
      - name: Run examples tests on GPU
        if: ${{ always() }}
        env:
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 16
+          MKL_NUM_THREADS: 16
          RUN_SLOW: yes
+          HF_HOME: /mnt/cache
        run: |
-          source .env/bin/activate
          pip install -r examples/_tests_requirements.txt
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples
+          python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples

      - name: Failure short reports
        if: ${{ always() }}
@@ -92,13 +66,13 @@ jobs:
      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 16
+          MKL_NUM_THREADS: 16
          RUN_SLOW: yes
          RUN_PIPELINE_TESTS: yes
+          HF_HOME: /mnt/cache
        run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
+          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -111,64 +85,39 @@ jobs:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

-
  run_all_tests_tf_gpu:
-    runs-on: [self-hosted, gpu, single-gpu]
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
-      - uses: actions/checkout@v2
+      - name: Launcher docker
+        uses: actions/checkout@v2

-      - name: Loading cache.
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: .env
-          key: v1.2-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}
-
-      - name: Python version
+      - name: NVIDIA-SMI
        run: |
-          which python
-          python --version
-          pip --version
-
-      - name: Current dir
-        run: pwd
-
-      - run: nvidia-smi
-
-      - name: Kill any run-away pytest processes
-        run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-
-
-      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-        if: steps.cache.outputs.cache-hit != 'true'
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          which python
-          python --version
-          pip --version
+          nvidia-smi

      - name: Install dependencies
        run: |
-          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
-          pip install git+https://github.com/huggingface/datasets
-          pip list
+          pip install .[sklearn,testing,onnx,sentencepiece]

      - name: Are GPUs recognized by our DL frameworks
        run: |
-          source .env/bin/activate
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      - name: Run all tests on GPU
        env:
-          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
+          HF_HOME: /mnt/cache
+          OMP_NUM_THREADS: 16
+          TF_NUM_INTEROP_THREADS: 1
+          TF_NUM_INTRAOP_THREADS: 16
+          MKL_NUM_THREADS: 16
        run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_gpu tests
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -177,17 +126,19 @@ jobs:
      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
-          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
+          HF_HOME: /mnt/cache
+          OMP_NUM_THREADS: 16
          RUN_PIPELINE_TESTS: yes
+          TF_NUM_INTEROP_THREADS: 1
+          TF_NUM_INTRAOP_THREADS: 16
+          MKL_NUM_THREADS: 16
        run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_gpu tests
+          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/tests_tf_pipelines_gpu_failures_short.txt
+        run: cat reports/tests_tf_pipeline_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
@@ -197,92 +148,55 @@ jobs:
          path: reports

  run_all_tests_torch_multi_gpu:
-    runs-on: [self-hosted, gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
-      - uses: actions/checkout@v2
+      - name: Launcher docker
+        uses: actions/checkout@v2

-      - name: Loading cache.
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: .env
-          key: v1.2-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
-
-      - name: Python version
+      - name: NVIDIA-SMI
        run: |
-          which python
-          python --version
-          pip --version
-
-      - name: Current dir
-        run: pwd
-
-      - run: nvidia-smi
-
-      - name: Kill any run-away pytest processes
-        run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-
-      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-        if: steps.cache.outputs.cache-hit != 'true'
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          which python
-          python --version
-          pip --version
+          nvidia-smi

      - name: Install dependencies
        run: |
-          source .env/bin/activate
+          apt -y update && apt install -y libsndfile1-dev
          pip install --upgrade pip
-          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
-          pip install git+https://github.com/huggingface/datasets
-          pip install fairscale
-          pip install deepspeed
-          pip list
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]

      - name: Are GPUs recognized by our DL frameworks
        run: |
-          source .env/bin/activate
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

-      - name: Run all tests on multi-GPU
+      - name: Run all tests on GPU
        env:
-          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
+          HF_HOME: /mnt/cache
+          OMP_NUM_THREADS: 16
+          MKL_NUM_THREADS: 16
+          MKL_SERVICE_FORCE_INTEL: 1
        run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_multi_gpu_failures_short.txt

-      - name: Run examples tests on multi-GPU
+      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
-          OMP_NUM_THREADS: 1
-          RUN_SLOW: yes
-        run: |
-          source .env/bin/activate
-          pip install -r examples/_tests_requirements.txt
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_examples_multi_gpu examples
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_torch_examples_multi_gpu_failures_short.txt
-
-      - name: Run all pipeline tests on multi-GPU
-        if: ${{ always() }}
-        env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 16
+          MKL_NUM_THREADS: 16
          RUN_SLOW: yes
          RUN_PIPELINE_TESTS: yes
+          HF_HOME: /mnt/cache
        run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -296,76 +210,55 @@ jobs:
          path: reports

  run_all_tests_tf_multi_gpu:
-    runs-on: [self-hosted, gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
-      - uses: actions/checkout@v2
+      - name: Launcher docker
+        uses: actions/checkout@v2

-      - name: Loading cache.
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: .env
-          key: v1.2-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
-
-      - name: Python version
+      - name: NVIDIA-SMI
        run: |
-          which python
-          python --version
-          pip --version
-
-      - name: Current dir
-        run: pwd
-
-      - run: nvidia-smi
-
-      - name: Kill any run-away pytest processes
-        run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-
-      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-        if: steps.cache.outputs.cache-hit != 'true'
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          which python
-          python --version
-          pip --version
+          nvidia-smi

      - name: Install dependencies
        run: |
-          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
-          pip install git+https://github.com/huggingface/datasets
-          pip list
+          pip install .[sklearn,testing,onnx,sentencepiece]

      - name: Are GPUs recognized by our DL frameworks
        run: |
-          source .env/bin/activate
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

-      - name: Run all tests on multi-GPU
+      - name: Run all tests on GPU
        env:
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 16
          RUN_SLOW: yes
+          MKL_NUM_THREADS: 16
+          TF_NUM_INTEROP_THREADS: 1
+          TF_NUM_INTRAOP_THREADS: 16
+          HF_HOME: /mnt/cache
        run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_multi_gpu_failures_short.txt

-      - name: Run all pipeline tests on multi-GPU
+      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 16
          RUN_SLOW: yes
          RUN_PIPELINE_TESTS: yes
+          MKL_NUM_THREADS: 16
+          TF_NUM_INTEROP_THREADS: 1
+          TF_NUM_INTRAOP_THREADS: 16
+          HF_HOME: /mnt/cache
        run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
+          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -377,3 +270,23 @@ jobs:
        with:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports
+
+  send_results:
+    name: Send results to webhook
+    runs-on: ubuntu-latest
+    if: always()
+    needs: [run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu]
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: actions/download-artifact@v2
+
+      - name: Send message to Slack
+        env:
+          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+
+
+        run: |
+          pip install slack_sdk
+          python utils/notification_service.py scheduled