diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index 5f408e88fc..8af6f8ea5c 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -10,73 +10,42 @@ on:
       - "tests/**"
       - ".github/**"
       - "templates/**"
-  # pull_request:
   repository_dispatch:
 
-
 jobs:
   run_tests_torch_gpu:
-    runs-on: [self-hosted, gpu, single-gpu]
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-      - uses: actions/checkout@v2
-      - name: Python version
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
         run: |
-          which python
-          python --version
-          pip --version
-
-      - name: Current dir
-        run: pwd
-
-      - run: nvidia-smi
-
-      - name: Kill any run-away pytest processes
-        run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-
-      - name: Loading cache.
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: .env
-          key: v1.2-tests_torch_gpu-${{ hashFiles('setup.py') }}
-
-      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          which python
-          python --version
-          pip --version
+          nvidia-smi
 
       - name: Install dependencies
         run: |
-          source .env/bin/activate
-          sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+          apt -y update && apt install -y libsndfile1-dev
           pip install --upgrade pip
-          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece,speech]
-          pip install git+https://github.com/huggingface/datasets
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
-          source .env/bin/activate
           python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
           python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
 
-#      - name: Create model files
-#        run: |
-#          source .env/bin/activate
-#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
-#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-
       - name: Run all non-slow tests on GPU
         env:
-          OMP_NUM_THREADS: 1
-          CUDA_VISIBLE_DEVICES: 0
+          OMP_NUM_THREADS: 8
+          MKL_NUM_THREADS: 8
+          HF_HOME: /mnt/cache
         run: |
-          source .env/bin/activate
-          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_gpu tests
+          python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_gpu tests
 
       - name: Failure short reports
         if: ${{ always() }}
@@ -89,68 +58,38 @@ jobs:
           name: run_all_tests_torch_gpu_test_reports
           path: reports
 
-
   run_tests_tf_gpu:
-    runs-on: [self-hosted, gpu, single-gpu]
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-      - uses: actions/checkout@v2
-      - name: Python version
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
         run: |
-          which python
-          python --version
-          pip --version
-
-      - name: Current dir
-        run: pwd
-
-      - run: nvidia-smi
-
-      - name: Kill any run-away pytest processes
-        run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-
-      - name: Loading cache.
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: .env
-          key: v1.2-tests_tf_gpu-${{ hashFiles('setup.py') }}
-
-      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          which python
-          python --version
-          pip --version
+          nvidia-smi
 
       - name: Install dependencies
         run: |
-          source .env/bin/activate
           pip install --upgrade pip
-          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
-          pip install git+https://github.com/huggingface/datasets
+          pip install .[sklearn,testing,onnxruntime,sentencepiece]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
-          source .env/bin/activate
           TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
           TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
 
-      - name: Create model files
-        run: |
-          source .env/bin/activate
-#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
-#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
-
       - name: Run all non-slow tests on GPU
         env:
-          OMP_NUM_THREADS: 1
-          CUDA_VISIBLE_DEVICES: 0
+          OMP_NUM_THREADS: 8
+          MKL_NUM_THREADS: 8
+          TF_NUM_INTRAOP_THREADS: 8
+          TF_NUM_INTEROP_THREADS: 1
+          HF_HOME: /mnt/cache
         run: |
-          source .env/bin/activate
-          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_gpu tests
+          python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_gpu tests
 
       - name: Failure short reports
         if: ${{ always() }}
@@ -163,58 +102,41 @@ jobs:
           name: run_all_tests_tf_gpu_test_reports
           path: reports
 
+
   run_tests_torch_multi_gpu:
-    runs-on: [self-hosted, gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-      - uses: actions/checkout@v2
-      - name: Python version
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
         run: |
-          which python
-          python --version
-          pip --version
+          nvidia-smi
 
-      - name: Current dir
-        run: pwd
-
-      - run: nvidia-smi
-
-      - name: Kill any run-away pytest processes
-        run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-
-      - name: Loading cache.
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: .env
-          key: v1.2-tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
-
-      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          which python
-          python --version
-          pip --version
       - name: Install dependencies
         run: |
-          source .env/bin/activate
-          sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+          apt -y update && apt install -y libsndfile1-dev
           pip install --upgrade pip
-          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece,speech]
-          pip install git+https://github.com/huggingface/datasets
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
-          source .env/bin/activate
           python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
           python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
 
       - name: Run all non-slow tests on GPU
         env:
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 8
+          MKL_NUM_THREADS: 8
+          MKL_SERVICE_FORCE_INTEL: 1
+          HF_HOME: /mnt/cache
         run: |
-          source .env/bin/activate
-          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
+          python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
 
       - name: Failure short reports
         if: ${{ always() }}
@@ -228,56 +150,37 @@ jobs:
           path: reports
 
   run_tests_tf_multi_gpu:
-    runs-on: [self-hosted, gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-      - uses: actions/checkout@v2
-      - name: Python version
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
         run: |
-          which python
-          python --version
-          pip --version
+          nvidia-smi
 
-      - name: Current dir
-        run: pwd
-
-      - run: nvidia-smi
-
-      - name: Kill any run-away pytest processes
-        run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-
-      - name: Loading cache.
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: .env
-          key: v1.2-tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
-
-      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          which python
-          python --version
-          pip --version
       - name: Install dependencies
         run: |
-          source .env/bin/activate
           pip install --upgrade pip
-          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
-          pip install git+https://github.com/huggingface/datasets
+          pip install .[sklearn,testing,onnxruntime,sentencepiece]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
-          source .env/bin/activate
           TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
           TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
 
       - name: Run all non-slow tests on GPU
         env:
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 8
+          MKL_NUM_THREADS: 8
+          TF_NUM_INTRAOP_THREADS: 8
+          TF_NUM_INTEROP_THREADS: 1
+          HF_HOME: /mnt/cache
         run: |
-          source .env/bin/activate
-          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
+          python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
 
       - name: Failure short reports
         if: ${{ always() }}
@@ -289,3 +192,24 @@ jobs:
         with:
           name: run_all_tests_tf_multi_gpu_test_reports
           path: reports
+
+  # Reporting job: waits for all four test jobs and still runs when some of them failed.
+  send_results:
+    name: Send results to webhook
+    runs-on: ubuntu-latest
+    if: always()
+    needs: [run_tests_torch_gpu, run_tests_tf_gpu, run_tests_torch_multi_gpu, run_tests_tf_multi_gpu]
+    steps:
+      - uses: actions/checkout@v2
+
+      # No artifact name is given here; presumably this fetches every report artifact uploaded above.
+      - uses: actions/download-artifact@v2
+
+      - name: Send message to Slack
+        env:
+          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+
+        run: |
+          pip install slack_sdk
+          python utils/notification_service.py push
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 66e3487f39..5072041113 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -1,8 +1,3 @@
-# configuration notes:
-#
-# - `source .env/bin/activate` is currently needed to be run first thing first in each step. Otherwise
-# the step uses the system-wide python interpreter.
-
 name: Self-hosted runner (scheduled)
 
 on:
@@ -15,61 +10,39 @@ on:
 
 jobs:
   run_all_tests_torch_gpu:
-    runs-on: [self-hosted, gpu, single-gpu]
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-      - uses: actions/checkout@v2
+      - name: Launcher docker
+        uses: actions/checkout@v2
 
-      - name: Loading cache.
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: .env
-          key: v  1.2-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}
-
-      - name: Python version
+      - name: NVIDIA-SMI
         run: |
-          which python
-          python --version
-          pip --version
-
-      - name: Current dir
-        run: pwd
-
-      - run: nvidia-smi
-
-      - name: Kill any run-away pytest processes
-        run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-
-      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-        if: steps.cache.outputs.cache-hit != 'true'
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          which python
-          python --version
-          pip --version
+          nvidia-smi
 
       - name: Install dependencies
         run: |
-          source .env/bin/activate
+          apt -y update && apt install -y libsndfile1-dev
           pip install --upgrade pip
-          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
-          pip install git+https://github.com/huggingface/datasets
-          pip list
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
-          source .env/bin/activate
           python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
           python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
 
       - name: Run all tests on GPU
         env:
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 16
+          MKL_NUM_THREADS: 16
           RUN_SLOW: yes
+          HF_HOME: /mnt/cache
         run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_gpu tests
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_gpu tests
 
       - name: Failure short reports
         if: ${{ always() }}
@@ -78,12 +51,13 @@ jobs:
       - name: Run examples tests on GPU
         if: ${{ always() }}
         env:
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 16
+          MKL_NUM_THREADS: 16
           RUN_SLOW: yes
+          HF_HOME: /mnt/cache
         run: |
-          source .env/bin/activate
           pip install -r examples/_tests_requirements.txt
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples
+          python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples
 
       - name: Failure short reports
         if: ${{ always() }}
@@ -92,13 +66,13 @@ jobs:
       - name: Run all pipeline tests on GPU
         if: ${{ always() }}
         env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 16
+          MKL_NUM_THREADS: 16
           RUN_SLOW: yes
           RUN_PIPELINE_TESTS: yes
+          HF_HOME: /mnt/cache
         run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
+          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
 
       - name: Failure short reports
         if: ${{ always() }}
@@ -111,64 +85,39 @@ jobs:
           name: run_all_tests_torch_gpu_test_reports
           path: reports
 
-
   run_all_tests_tf_gpu:
-    runs-on: [self-hosted, gpu, single-gpu]
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-      - uses: actions/checkout@v2
+      - name: Launcher docker
+        uses: actions/checkout@v2
 
-      - name: Loading cache.
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: .env
-          key: v1.2-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}
-
-      - name: Python version
+      - name: NVIDIA-SMI
         run: |
-          which python
-          python --version
-          pip --version
-
-      - name: Current dir
-        run: pwd
-
-      - run: nvidia-smi
-
-      - name: Kill any run-away pytest processes
-        run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-
-
-      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-        if: steps.cache.outputs.cache-hit != 'true'
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          which python
-          python --version
-          pip --version
+          nvidia-smi
 
       - name: Install dependencies
         run: |
-          source .env/bin/activate
           pip install --upgrade pip
-          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
-          pip install git+https://github.com/huggingface/datasets
-          pip list
+          pip install .[sklearn,testing,onnx,sentencepiece]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
-          source .env/bin/activate
           TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
           TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
 
       - name: Run all tests on GPU
         env:
-          OMP_NUM_THREADS: 1
           RUN_SLOW: yes
+          HF_HOME: /mnt/cache
+          OMP_NUM_THREADS: 16
+          TF_NUM_INTEROP_THREADS: 1
+          TF_NUM_INTRAOP_THREADS: 16
+          MKL_NUM_THREADS: 16
         run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_gpu tests
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
 
       - name: Failure short reports
         if: ${{ always() }}
@@ -177,17 +126,19 @@ jobs:
       - name: Run all pipeline tests on GPU
         if: ${{ always() }}
         env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
-          OMP_NUM_THREADS: 1
           RUN_SLOW: yes
+          HF_HOME: /mnt/cache
+          OMP_NUM_THREADS: 16
           RUN_PIPELINE_TESTS: yes
+          TF_NUM_INTEROP_THREADS: 1
+          TF_NUM_INTRAOP_THREADS: 16
+          MKL_NUM_THREADS: 16
         run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_gpu tests
+          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests
 
       - name: Failure short reports
         if: ${{ always() }}
-        run: cat reports/tests_tf_pipelines_gpu_failures_short.txt
+        run: cat reports/tests_tf_pipeline_gpu_failures_short.txt
 
       - name: Test suite reports artifacts
         if: ${{ always() }}
@@ -197,92 +148,55 @@ jobs:
           path: reports
 
   run_all_tests_torch_multi_gpu:
-    runs-on: [self-hosted, gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-      - uses: actions/checkout@v2
+      - name: Launcher docker
+        uses: actions/checkout@v2
 
-      - name: Loading cache.
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: .env
-          key: v1.2-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
-
-      - name: Python version
+      - name: NVIDIA-SMI
         run: |
-          which python
-          python --version
-          pip --version
-
-      - name: Current dir
-        run: pwd
-
-      - run: nvidia-smi
-
-      - name: Kill any run-away pytest processes
-        run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-
-      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-        if: steps.cache.outputs.cache-hit != 'true'
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          which python
-          python --version
-          pip --version
+          nvidia-smi
 
       - name: Install dependencies
         run: |
-          source .env/bin/activate
+          apt -y update && apt install -y libsndfile1-dev
           pip install --upgrade pip
-          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
-          pip install git+https://github.com/huggingface/datasets
-          pip install fairscale
-          pip install deepspeed
-          pip list
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
-          source .env/bin/activate
           python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
           python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
 
-      - name: Run all tests on multi-GPU
+      - name: Run all tests on GPU
         env:
-          OMP_NUM_THREADS: 1
           RUN_SLOW: yes
+          HF_HOME: /mnt/cache
+          OMP_NUM_THREADS: 16
+          MKL_NUM_THREADS: 16
+          MKL_SERVICE_FORCE_INTEL: 1
         run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
 
       - name: Failure short reports
         if: ${{ always() }}
         run: cat reports/tests_torch_multi_gpu_failures_short.txt
 
-      - name: Run examples tests on multi-GPU
+      - name: Run all pipeline tests on GPU
         if: ${{ always() }}
         env:
-          OMP_NUM_THREADS: 1
-          RUN_SLOW: yes
-        run: |
-          source .env/bin/activate
-          pip install -r examples/_tests_requirements.txt
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_examples_multi_gpu examples
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_torch_examples_multi_gpu_failures_short.txt
-
-      - name: Run all pipeline tests on multi-GPU
-        if: ${{ always() }}
-        env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 16
+          MKL_NUM_THREADS: 16
           RUN_SLOW: yes
           RUN_PIPELINE_TESTS: yes
+          HF_HOME: /mnt/cache
         run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
 
       - name: Failure short reports
         if: ${{ always() }}
@@ -296,76 +210,55 @@ jobs:
           path: reports
 
   run_all_tests_tf_multi_gpu:
-    runs-on: [self-hosted, gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-      - uses: actions/checkout@v2
+      - name: Launcher docker
+        uses: actions/checkout@v2
 
-      - name: Loading cache.
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: .env
-          key: v1.2-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
-
-      - name: Python version
+      - name: NVIDIA-SMI
         run: |
-          which python
-          python --version
-          pip --version
-
-      - name: Current dir
-        run: pwd
-
-      - run: nvidia-smi
-
-      - name: Kill any run-away pytest processes
-        run: (pkill -f tests; pkill -f examples) || echo "no zombies"
-
-      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-        if: steps.cache.outputs.cache-hit != 'true'
-        run: |
-          python -m venv .env
-          source .env/bin/activate
-          which python
-          python --version
-          pip --version
+          nvidia-smi
 
       - name: Install dependencies
         run: |
-          source .env/bin/activate
           pip install --upgrade pip
-          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
-          pip install git+https://github.com/huggingface/datasets
-          pip list
+          pip install .[sklearn,testing,onnx,sentencepiece]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
-          source .env/bin/activate
           TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
           TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
 
-      - name: Run all tests on multi-GPU
+      - name: Run all tests on GPU
         env:
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 16
           RUN_SLOW: yes
+          MKL_NUM_THREADS: 16
+          TF_NUM_INTEROP_THREADS: 1
+          TF_NUM_INTRAOP_THREADS: 16
+          HF_HOME: /mnt/cache
         run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
 
       - name: Failure short reports
         if: ${{ always() }}
         run: cat reports/tests_tf_multi_gpu_failures_short.txt
 
-      - name: Run all pipeline tests on multi-GPU
+      - name: Run all pipeline tests on GPU
         if: ${{ always() }}
         env:
-          TF_FORCE_GPU_ALLOW_GROWTH: "true"
-          OMP_NUM_THREADS: 1
+          OMP_NUM_THREADS: 16
           RUN_SLOW: yes
           RUN_PIPELINE_TESTS: yes
+          MKL_NUM_THREADS: 16
+          TF_NUM_INTEROP_THREADS: 1
+          TF_NUM_INTRAOP_THREADS: 16
+          HF_HOME: /mnt/cache
         run: |
-          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
+          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
 
       - name: Failure short reports
         if: ${{ always() }}
@@ -377,3 +270,23 @@ jobs:
         with:
           name: run_all_tests_tf_multi_gpu_test_reports
           path: reports
+
+  send_results:
+    name: Send results to webhook
+    runs-on: ubuntu-latest
+    if: always()
+    needs: [run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu]
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: actions/download-artifact@v2
+
+      - name: Send message to Slack
+        env:
+          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+
+
+        run: |
+          pip install slack_sdk
+          python utils/notification_service.py scheduled
diff --git a/setup.py b/setup.py
index 7903198180..16567d71c0 100644
--- a/setup.py
+++ b/setup.py
@@ -115,6 +115,7 @@ _deps = [
     "psutil",
     "pydantic",
     "pytest",
+    "pytest-sugar",
     "pytest-xdist",
     "python>=3.6.0",
     "recommonmark",
@@ -225,6 +226,7 @@ else:
 
 extras["tokenizers"] = deps_list("tokenizers")
 extras["onnxruntime"] = deps_list("onnxruntime", "onnxruntime-tools")
+extras["onnx"] = deps_list("onnxconverter-common", "keras2onnx") + extras["onnxruntime"]
 extras["modelcreation"] = deps_list("cookiecutter")
 
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
@@ -232,7 +234,7 @@ extras["speech"] = deps_list("soundfile", "torchaudio")
 
 extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
 extras["testing"] = (
-    deps_list("pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets")
+    deps_list("pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar")
     + extras["retrieval"]
     + extras["modelcreation"]
 )
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index 6022ac220b..576fbe7cd6 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -28,6 +28,7 @@ deps = {
     "psutil": "psutil",
     "pydantic": "pydantic",
     "pytest": "pytest",
+    "pytest-sugar": "pytest-sugar",
     "pytest-xdist": "pytest-xdist",
     "python": "python>=3.6.0",
     "recommonmark": "recommonmark",
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 13838fab40..063aba5553 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -137,6 +137,17 @@ def slow(test_case):
         return test_case
 
 
+def tooslow(test_case):
+    """
+    Decorator marking a test as too slow.
+
+    The decorated test is unconditionally skipped. The marker is meant to be temporary, while the test is being
+    fixed: no test should stay tagged as "tooslow", as tagged tests are not run by the CI.
+    """
+    skip_decorator = unittest.skip("test is too slow")
+    return skip_decorator(test_case)
+
+
 def custom_tokenizers(test_case):
     """
     Decorator marking a test for a custom tokenizer.
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 6f66350a9c..a2f7085660 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -25,7 +25,14 @@ from importlib import import_module
 from typing import List, Tuple
 
 from transformers import is_tf_available
-from transformers.testing_utils import _tf_gpu_memory_limit, is_pt_tf_cross_test, require_onnx, require_tf, slow
+from transformers.testing_utils import (
+    _tf_gpu_memory_limit,
+    is_pt_tf_cross_test,
+    require_onnx,
+    require_tf,
+    slow,
+    tooslow,
+)
 
 
 if is_tf_available():
@@ -129,7 +136,7 @@ class TFModelTesterMixin:
 
                 self.assert_outputs_same(after_outputs, outputs)
 
-    @slow
+    @tooslow
     def test_graph_mode(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
@@ -143,7 +150,7 @@ class TFModelTesterMixin:
             outputs = run_in_graph_mode()
             self.assertIsNotNone(outputs)
 
-    @slow
+    @tooslow
     def test_xla_mode(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
@@ -184,7 +191,7 @@ class TFModelTesterMixin:
                 expected_arg_names = ["input_ids"]
                 self.assertListEqual(arg_names[:1], expected_arg_names)
 
-    @slow
+    @tooslow
     def test_saved_model_creation(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.output_hidden_states = False
@@ -205,7 +212,7 @@ class TFModelTesterMixin:
             saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
             self.assertTrue(os.path.exists(saved_model_dir))
 
-    @slow
+    @tooslow
     def test_saved_model_creation_extended(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.output_hidden_states = True
@@ -314,7 +321,7 @@ class TFModelTesterMixin:
 
             onnxruntime.InferenceSession(onnx_model.SerializeToString())
 
-    @slow
+    @tooslow
     def test_mixed_precision(self):
         tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
 
@@ -488,7 +495,7 @@ class TFModelTesterMixin:
             max_diff = np.amax(np.abs(tfo - pto))
             self.assertLessEqual(max_diff, 4e-2)
 
-    @slow
+    @tooslow
     def test_train_pipeline_custom_model(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         # head_mask and decoder_head_mask has different shapes than other input args
@@ -909,7 +916,7 @@ class TFModelTesterMixin:
 
             model(inputs)
 
-    @slow
+    @tooslow
     def test_graph_mode_with_inputs_embeds(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
diff --git a/utils/notification_service.py b/utils/notification_service.py
new file mode 100644
index 0000000000..fb3fdebcf8
--- /dev/null
+++ b/utils/notification_service.py
@@ -0,0 +1,185 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import sys
+
+from slack_sdk import WebClient
+
+
+def handle_test_results(test_results):
+    """
+    Parse a pytest summary line such as "== 1 failed, 2 passed in 65.00s (0:01:05) ==".
+
+    Returns a tuple `(failed, success, time_spent)`: the number of failed tests, the number of passed tests and the
+    last whitespace-separated token that is not part of the "=" ruler (the duration when pytest prints one last).
+    """
+    # Split on any whitespace so that a trailing newline never ends up glued to the last token.
+    expressions = test_results.split()
+    # When the output is short enough, the output is surrounded by = signs: "== OUTPUT =="
+    # When it is too long, those signs are not present.
+    meaningful = expressions[:-1] if expressions and "=" in expressions[-1] else expressions
+    time_spent = meaningful[-1] if meaningful else ""
+    # Match "N failed" / "N passed" only: "xfailed" / "xpassed" are not counted and int() only ever sees digits.
+    failed = sum(int(count) for count in re.findall(r"(\d+) failed", test_results))
+    success = sum(int(count) for count in re.findall(r"(\d+) passed", test_results))
+    return failed, success, time_spent
+
+
+def format_for_slack(total_results, results, scheduled: bool):
+    """
+    Build the Slack Block Kit payload summarizing a CI run.
+
+    Args:
+        total_results: dict with the "failed" and "success" counts summed over every job.
+        results: dict mapping a job name to its own "failed", "success" and "time_spent" entries.
+        scheduled: `True` for the scheduled workflow report, `False` for the self-push workflow report.
+
+    Returns:
+        A dict of the form `{"blocks": [...]}` holding a header, the totals, the per-job entries and a footer link.
+    """
+    import datetime  # local import: only needed to stamp the scheduled report with the day it is produced
+
+    print(results)
+    has_failures = total_results["failed"] > 0
+    if scheduled:
+        today = datetime.date.today()
+        # Stamp the report with the current day instead of a fixed date, so that every run is labelled correctly.
+        title = f"🤗 Results of the scheduled tests, {today:%B} {today.day}, {today.year}."
+        workflow = "self-scheduled.yml"
+    else:
+        title = "🤗 Self-push results"
+        workflow = "self-push.yml"
+
+    header = {
+        "type": "header",
+        "text": {
+            "type": "plain_text",
+            "text": title,
+            "emoji": True,
+        },
+    }
+    if has_failures:
+        total_fields = [
+            {"type": "mrkdwn", "text": f"*Failures:*\n❌ {total_results['failed']} failures."},
+            {"type": "mrkdwn", "text": f"*Passed:*\n✅ {total_results['success']} tests passed."},
+        ]
+    else:
+        total_fields = [{"type": "mrkdwn", "text": f"*Congrats!*\nAll {total_results['success']} tests pass."}]
+    blocks = [header, {"type": "section", "fields": total_fields}]
+
+    for key, result in results.items():
+        if has_failures:
+            # Detailed view: a header per job followed by its counts and timings.
+            print(key, result)
+            blocks.append({"type": "header", "text": {"type": "plain_text", "text": key, "emoji": True}})
+            fields = [
+                {"type": "mrkdwn", "text": f"*Results:*\n{result['failed']} failed, {result['success']} passed."},
+                {"type": "mrkdwn", "text": f"*Time spent:*\n{result['time_spent']}"},
+            ]
+        else:
+            # Everything passed: only report how long each job took.
+            fields = [{"type": "mrkdwn", "text": f"*{key}*\n{result['time_spent']}."}]
+        blocks.append({"type": "section", "fields": fields})
+
+    # The footer links to the GitHub Actions page of the workflow that produced the report.
+    url = f"https://github.com/huggingface/transformers/actions/workflows/{workflow}"
+    footer = {
+        "type": "section",
+        "text": {"type": "mrkdwn", "text": f"<{url}|View on GitHub>"},
+    }
+    blocks.append(footer)
+
+    return {"blocks": blocks}
+
+
+if __name__ == "__main__":
+    # Usage: python utils/notification_service.py [scheduled]
+    # Anything other than "scheduled" (including no argument at all) selects the self-push report.
+    scheduled = len(sys.argv) > 1 and sys.argv[1] == "scheduled"
+
+    if scheduled:
+        # The scheduled run has several artifacts for each job.
+        file_paths = {
+            "TF Single GPU": {
+                "common": "run_all_tests_tf_gpu_test_reports/tests_tf_gpu_[].txt",
+                "pipeline": "run_all_tests_tf_gpu_test_reports/tests_tf_pipeline_gpu_[].txt",
+            },
+            "Torch Single GPU": {
+                "common": "run_all_tests_torch_gpu_test_reports/tests_torch_gpu_[].txt",
+                "pipeline": "run_all_tests_torch_gpu_test_reports/tests_torch_pipeline_gpu_[].txt",
+                "examples": "run_all_tests_torch_gpu_test_reports/examples_torch_gpu_[].txt",
+            },
+            "TF Multi GPU": {
+                "common": "run_all_tests_tf_multi_gpu_test_reports/tests_tf_multi_gpu_[].txt",
+                "pipeline": "run_all_tests_tf_multi_gpu_test_reports/tests_tf_pipeline_multi_gpu_[].txt",
+            },
+            "Torch Multi GPU": {
+                "common": "run_all_tests_torch_multi_gpu_test_reports/tests_torch_multi_gpu_[].txt",
+                "pipeline": "run_all_tests_torch_multi_gpu_test_reports/tests_torch_pipeline_multi_gpu_[].txt",
+            },
+        }
+    else:
+        file_paths = {
+            "TF Single GPU": {"common": "run_all_tests_tf_gpu_test_reports/tests_tf_gpu_[].txt"},
+            "Torch Single GPU": {"common": "run_all_tests_torch_gpu_test_reports/tests_torch_gpu_[].txt"},
+            "TF Multi GPU": {"common": "run_all_tests_tf_multi_gpu_test_reports/tests_tf_multi_gpu_[].txt"},
+            "Torch Multi GPU": {"common": "run_all_tests_torch_multi_gpu_test_reports/tests_torch_multi_gpu_[].txt"},
+        }
+
+    client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
+    channel_id = os.environ["CI_SLACK_CHANNEL_ID"]
+
+    results = {}
+    try:
+        for job, file_dict in file_paths.items():
+            # Single return value for failed/success across steps of a same job
+            results[job] = {"failed": 0, "success": 0, "time_spent": "", "failures": ""}
+            durations = []
+
+            for key, file_path in file_dict.items():
+                # "[]" in a path is a placeholder for the kind of report: "stats" or "summary_short".
+                with open(file_path.replace("[]", "stats")) as f:
+                    failed, success, time_spent = handle_test_results(f.read())
+                results[job]["failed"] += failed
+                results[job]["success"] += success
+                # Drop surrounding whitespace and parentheses, e.g. "(0:01:05)", but keep a bare "12.34s" intact.
+                durations.append(time_spent.strip().strip("()"))
+                with open(file_path.replace("[]", "summary_short")) as f:
+                    results[job]["failures"] += "".join(line for line in f if "FAILED" in line)
+
+            results[job]["time_spent"] = ", ".join(durations)
+    except FileNotFoundError as e:
+        # Only a missing report means the artifacts are absent; any other error keeps its own type and traceback.
+        raise Exception(f"Setup error: no artifacts were found. Error: {e}") from e
+
+    total = {"failed": 0, "success": 0}
+    for job_result in results.values():
+        for result_key in total:
+            total[result_key] += job_result[result_key]
+
+    to_be_sent_to_slack = format_for_slack(total, results, scheduled)
+
+    # Post the summary first: the per-job failure details are then sent as replies in its thread.
+    result = client.chat_postMessage(
+        channel=channel_id,
+        blocks=to_be_sent_to_slack["blocks"],
+    )
+
+    for job, job_result in results.items():
+        if len(job_result["failures"]):
+            client.chat_postMessage(
+                channel=channel_id, text=f"{job}\n{job_result['failures']}", thread_ts=result["ts"]
+            )