From 056723ad1d1666a76cb80b4883894e83636aa5cf Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Wed, 30 Sep 2020 11:53:34 +0200
Subject: [PATCH] Multi-GPU setup (#7453)

---
 .github/workflows/self-push.yml      | 52 +++++++++++++++++++++-
 .github/workflows/self-scheduled.yml | 64 +++++++++++++++++++++++++++-
 2 files changed, 114 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index 6e3f368cb7..5ac040b920 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -14,7 +14,7 @@ on:
 
 jobs:
   run_tests_torch_and_tf_gpu:
-    runs-on: self-hosted
+    runs-on: [self-hosted, single-gpu]
     steps:
     - uses: actions/checkout@v2
     - name: Python version
@@ -62,3 +62,53 @@ jobs:
       run: |
         source .env/bin/activate
         python -m pytest -n 2 --dist=loadfile -s ./tests/
+
+  run_tests_torch_and_tf_multiple_gpu:  # non-slow test suite, run on the runner labelled multi-gpu
+    runs-on: [self-hosted, multi-gpu]
+    steps:
+      - uses: actions/checkout@v2
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:  # caches the .env virtualenv; the key changes whenever setup.py changes
+          path: .env
+          key: v0-tests_tf_torch_multiple_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
+      - name: Install dependencies  # specifiers are quoted so the shell passes "!" and "[...]" to pip verbatim
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install "torch!=1.6.0"
+          pip install ".[sklearn,testing,onnxruntime]"
+          pip install git+https://github.com/huggingface/datasets
+
+      - name: Are GPUs recognized by our DL frameworks  # prints CUDA availability, then the visible device count
+        run: |
+          source .env/bin/activate
+          python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())"
+
+      - name: Run all non-slow tests on GPU
+        env:  # values are quoted so every variable is an explicit string, not a YAML bool/int
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          # TF_GPU_MEMORY_LIMIT: 4096
+          OMP_NUM_THREADS: "1"
+          USE_CUDA: "yes"
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s ./tests/
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 231fab7a95..e70be8cd09 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -10,7 +10,7 @@ on:
 
 jobs:
   run_all_tests_torch_and_tf_gpu:
-    runs-on: self-hosted
+    runs-on: [self-hosted, single-gpu]
     steps:
     - uses: actions/checkout@v2
 
@@ -70,3 +70,65 @@ jobs:
         source .env/bin/activate
         pip install -r examples/requirements.txt
         python -m pytest -n 1 --dist=loadfile -s examples
+
+  run_all_tests_torch_and_tf_multiple_gpu:  # tests/ and examples suites with RUN_SLOW set, on the multi-gpu runner
+    runs-on: [self-hosted, multi-gpu]
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:  # caches the .env virtualenv; the key changes whenever setup.py changes
+          path: .env
+          key: v0-slow_tests_tf_torch_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'  # skipped when the cache step reports an exact key hit
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
+      - name: Install dependencies  # specifiers are quoted so the shell passes "!" and "[...]" to pip verbatim
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install "torch!=1.6.0"
+          pip install ".[sklearn,testing,onnxruntime]"
+          pip install git+https://github.com/huggingface/datasets
+
+      - name: Are GPUs recognized by our DL frameworks  # prints CUDA availability, then the visible device count
+        run: |
+          source .env/bin/activate
+          python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        env:  # values are quoted so every variable is an explicit string, not a YAML bool/int
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: "1"
+          RUN_SLOW: "yes"
+          USE_CUDA: "yes"
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s ./tests/
+
+      - name: Run examples tests on GPU
+        env:  # same environment as the tests/ step above in this job
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: "1"
+          RUN_SLOW: "yes"
+          USE_CUDA: "yes"
+        run: |
+          source .env/bin/activate
+          pip install -r examples/requirements.txt
+          python -m pytest -n 1 --dist=loadfile -s examples