From 056723ad1d1666a76cb80b4883894e83636aa5cf Mon Sep 17 00:00:00 2001
From: Lysandre Debut
Date: Wed, 30 Sep 2020 11:53:34 +0200
Subject: [PATCH] Multi-GPU setup (#7453)

---
Reviewer notes (this region is ignored when the patch is applied):

  * The existing jobs `run_tests_torch_and_tf_gpu` (self-push) and
    `run_all_tests_torch_and_tf_gpu` (self-scheduled) move from the bare
    `self-hosted` label to `[self-hosted, single-gpu]`.
  * Each workflow gains a sibling job that targets `[self-hosted, multi-gpu]`
    and uses its own cache key for the `.env` virtualenv.
  * NOTE(review): only the self-scheduled multi-GPU job guards the
    "Create new python env" step with
    `if: steps.cache.outputs.cache-hit != 'true'`; the self-push multi-GPU job
    runs `python -m venv .env` unconditionally. Confirm this is intentional.
  * NOTE(review): the `From:` header in this copy has no e-mail address.

 .github/workflows/self-push.yml      | 52 +++++++++++++++++++++-
 .github/workflows/self-scheduled.yml | 64 +++++++++++++++++++++++++++-
 2 files changed, 114 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index 6e3f368cb7..5ac040b920 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -14,7 +14,7 @@ on:
 
 jobs:
   run_tests_torch_and_tf_gpu:
-    runs-on: self-hosted
+    runs-on: [self-hosted, single-gpu]
     steps:
     - uses: actions/checkout@v2
     - name: Python version
@@ -62,3 +62,53 @@ jobs:
       run: |
         source .env/bin/activate
         python -m pytest -n 2 --dist=loadfile -s ./tests/
+
+  run_tests_torch_and_tf_multiple_gpu:
+    runs-on: [self-hosted, multi-gpu]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Python version
+      run: |
+        which python
+        python --version
+        pip --version
+    - name: Current dir
+      run: pwd
+    - run: nvidia-smi
+
+    - name: Loading cache.
+      uses: actions/cache@v2
+      id: cache
+      with:
+        path: .env
+        key: v0-tests_tf_torch_multiple_gpu-${{ hashFiles('setup.py') }}
+
+    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+      run: |
+        python -m venv .env
+        source .env/bin/activate
+        which python
+        python --version
+        pip --version
+    - name: Install dependencies
+      run: |
+        source .env/bin/activate
+        pip install --upgrade pip
+        pip install torch!=1.6.0
+        pip install .[sklearn,testing,onnxruntime]
+        pip install git+https://github.com/huggingface/datasets
+
+    - name: Are GPUs recognized by our DL frameworks
+      run: |
+        source .env/bin/activate
+        python -c "import torch; print(torch.cuda.is_available())"
+
+    - name: Run all non-slow tests on GPU
+      env:
+        TF_FORCE_GPU_ALLOW_GROWTH: "true"
+        # TF_GPU_MEMORY_LIMIT: 4096
+        OMP_NUM_THREADS: 1
+        USE_CUDA: yes
+      run: |
+        source .env/bin/activate
+        python -m pytest -n 2 --dist=loadfile -s ./tests/
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 231fab7a95..e70be8cd09 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -10,7 +10,7 @@ on:
 
 jobs:
   run_all_tests_torch_and_tf_gpu:
-    runs-on: self-hosted
+    runs-on: [self-hosted, single-gpu]
     steps:
     - uses: actions/checkout@v2
 
@@ -70,3 +70,65 @@ jobs:
         source .env/bin/activate
         pip install -r examples/requirements.txt
         python -m pytest -n 1 --dist=loadfile -s examples
+
+  run_all_tests_torch_and_tf_multiple_gpu:
+    runs-on: [self-hosted, multi-gpu]
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Loading cache.
+      uses: actions/cache@v2
+      id: cache
+      with:
+        path: .env
+        key: v0-slow_tests_tf_torch_multi_gpu-${{ hashFiles('setup.py') }}
+
+    - name: Python version
+      run: |
+        which python
+        python --version
+        pip --version
+    - name: Current dir
+      run: pwd
+    - run: nvidia-smi
+    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+      if: steps.cache.outputs.cache-hit != 'true'
+      run: |
+        python -m venv .env
+        source .env/bin/activate
+        which python
+        python --version
+        pip --version
+    - name: Install dependencies
+      run: |
+        source .env/bin/activate
+        pip install --upgrade pip
+        pip install torch!=1.6.0
+        pip install .[sklearn,testing,onnxruntime]
+        pip install git+https://github.com/huggingface/datasets
+
+    - name: Are GPUs recognized by our DL frameworks
+      run: |
+        source .env/bin/activate
+        python -c "import torch; print(torch.cuda.is_available())"
+
+    - name: Run all tests on GPU
+      env:
+        TF_FORCE_GPU_ALLOW_GROWTH: "true"
+        OMP_NUM_THREADS: 1
+        RUN_SLOW: yes
+        USE_CUDA: yes
+      run: |
+        source .env/bin/activate
+        python -m pytest -n 1 --dist=loadfile -s ./tests/
+
+    - name: Run examples tests on GPU
+      env:
+        TF_FORCE_GPU_ALLOW_GROWTH: "true"
+        OMP_NUM_THREADS: 1
+        RUN_SLOW: yes
+        USE_CUDA: yes
+      run: |
+        source .env/bin/activate
+        pip install -r examples/requirements.txt
+        python -m pytest -n 1 --dist=loadfile -s examples