Multi-GPU setup (#7453)

2020-09-30 11:53:34 +02:00
parent 4ba248748f
commit 056723ad1d
2 changed files with 114 additions and 2 deletions
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -14,7 +14,7 @@ on:
 jobs:
  run_tests_torch_and_tf_gpu:
-    runs-on: self-hosted
+    runs-on: [self-hosted, single-gpu]
    steps:
    - uses: actions/checkout@v2
    - name: Python version
@@ -62,3 +62,53 @@ jobs:
      run: |
        source .env/bin/activate
        python -m pytest -n 2 --dist=loadfile -s ./tests/
  # Runs pytest over ./tests/ on a self-hosted runner carrying the `multi-gpu`
  # label. RUN_SLOW is not set in this job. Dependencies live in a virtualenv
  # at `.env`, which is cached between runs.
  run_tests_torch_and_tf_multiple_gpu:
    runs-on: [self-hosted, multi-gpu]
    steps:
      - uses: actions/checkout@v2
      - name: Python version
        run: |
          which python
          python --version
          pip --version
      - name: Current dir
        run: pwd
      - run: nvidia-smi
      - name: Loading cache.
        uses: actions/cache@v2
        id: cache
        with:
          path: .env
          # The key embeds the hash of setup.py, so editing setup.py yields a new key.
          key: v0-tests_tf_torch_multiple_gpu-${{ hashFiles('setup.py') }}
      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
        # Only build a fresh venv when the cache step did not restore `.env`.
        if: steps.cache.outputs.cache-hit != 'true'
        run: |
          python -m venv .env
          source .env/bin/activate
          which python
          python --version
          pip --version
      - name: Install dependencies
        # Requirement strings are quoted so bash never treats `[...]` as a glob
        # pattern or `!` as a special character.
        run: |
          source .env/bin/activate
          pip install --upgrade pip
          pip install "torch!=1.6.0"
          pip install ".[sklearn,testing,onnxruntime]"
          pip install git+https://github.com/huggingface/datasets
      - name: Are GPUs recognized by our DL frameworks
        # Only prints the result; a `False` here does not fail the step.
        run: |
          source .env/bin/activate
          python -c "import torch; print(torch.cuda.is_available())"
      - name: Run all non-slow tests on GPU
        # Values are quoted so every YAML parser hands them over as strings.
        env:
          TF_FORCE_GPU_ALLOW_GROWTH: "true"
          # TF_GPU_MEMORY_LIMIT: 4096
          OMP_NUM_THREADS: "1"
          USE_CUDA: "yes"
        run: |
          source .env/bin/activate
          python -m pytest -n 2 --dist=loadfile -s ./tests/
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -10,7 +10,7 @@ on:
 jobs:
  run_all_tests_torch_and_tf_gpu:
-    runs-on: self-hosted
+    runs-on: [self-hosted, single-gpu]
    steps:
    - uses: actions/checkout@v2
@@ -70,3 +70,65 @@ jobs:
        source .env/bin/activate
        pip install -r examples/requirements.txt
        python -m pytest -n 1 --dist=loadfile -s examples
  # Runs pytest over ./tests/ and then over examples/ with RUN_SLOW set, on a
  # self-hosted runner carrying the `multi-gpu` label. Dependencies live in a
  # virtualenv at `.env`, which is cached between runs.
  run_all_tests_torch_and_tf_multiple_gpu:
    runs-on: [self-hosted, multi-gpu]
    steps:
      - uses: actions/checkout@v2
      - name: Loading cache.
        uses: actions/cache@v2
        id: cache
        with:
          path: .env
          # The key embeds the hash of setup.py, so editing setup.py yields a new key.
          key: v0-slow_tests_tf_torch_multi_gpu-${{ hashFiles('setup.py') }}
      - name: Python version
        run: |
          which python
          python --version
          pip --version
      - name: Current dir
        run: pwd
      - run: nvidia-smi
      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
        # Only build a fresh venv when the cache step did not restore `.env`.
        if: steps.cache.outputs.cache-hit != 'true'
        run: |
          python -m venv .env
          source .env/bin/activate
          which python
          python --version
          pip --version
      - name: Install dependencies
        # Requirement strings are quoted so bash never treats `[...]` as a glob
        # pattern or `!` as a special character.
        run: |
          source .env/bin/activate
          pip install --upgrade pip
          pip install "torch!=1.6.0"
          pip install ".[sklearn,testing,onnxruntime]"
          pip install git+https://github.com/huggingface/datasets
      - name: Are GPUs recognized by our DL frameworks
        # Only prints the result; a `False` here does not fail the step.
        run: |
          source .env/bin/activate
          python -c "import torch; print(torch.cuda.is_available())"
      - name: Run all tests on GPU
        # Values are quoted so every YAML parser hands them over as strings.
        env:
          TF_FORCE_GPU_ALLOW_GROWTH: "true"
          OMP_NUM_THREADS: "1"
          RUN_SLOW: "yes"
          USE_CUDA: "yes"
        run: |
          source .env/bin/activate
          python -m pytest -n 1 --dist=loadfile -s ./tests/
      - name: Run examples tests on GPU
        # Same environment as the previous step; the examples need their own
        # requirements file installed first.
        env:
          TF_FORCE_GPU_ALLOW_GROWTH: "true"
          OMP_NUM_THREADS: "1"
          RUN_SLOW: "yes"
          USE_CUDA: "yes"
        run: |
          source .env/bin/activate
          pip install -r examples/requirements.txt
          python -m pytest -n 1 --dist=loadfile -s examples