Run CI on deepspeed and fairscale (#11172)

* Run CI on deepspeed and fairscale
* Test it on this branch :)
* Rename
* Update the CI image
2021-04-13 15:47:06 -04:00
parent f38cd4373f
commit 1ad7b0398c
2 changed files with 150 additions and 2 deletions
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -5,6 +5,7 @@ on:
    branches:
      - master
      - ci_*
      - ci-*
    paths:
      - "src/**"
      - "tests/**"
@@ -186,11 +187,85 @@ jobs:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports
  # Single-GPU job for the CUDA-extension integrations: runs inside the
  # NVIDIA PyTorch container, installs the `testing` and `deepspeed` extras
  # and executes tests/deepspeed and tests/extended.
  run_tests_torch_cuda_extensions_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2
      - name: NVIDIA-SMI
        run: |
          nvidia-smi
      # The extras spec is quoted so the shell can never expand `[...]` as a
      # glob pattern against files in the checkout.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install ".[testing,deepspeed]"
      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
      # Runs even after a failed step so the short failure summary always
      # shows up in the job log.
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
      # Keep the whole reports directory downloadable after the run,
      # including when the test step failed.
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_tests_torch_cuda_extensions_gpu_test_reports
          path: reports
  # Multi-GPU job for the CUDA-extension integrations: runs inside the
  # NVIDIA PyTorch container, installs the `testing`, `deepspeed` and
  # `fairscale` extras and executes tests/deepspeed and tests/extended.
  run_tests_torch_cuda_extensions_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      # `--gpus all` explicitly asks Docker to expose every GPU of the
      # multi-GPU runner to the container.
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2
      - name: NVIDIA-SMI
        run: |
          nvidia-smi
      # The extras spec is quoted so the shell can never expand `[...]` as a
      # glob pattern against files in the checkout.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install ".[testing,deepspeed,fairscale]"
      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
      # Runs even after a failed step so the short failure summary always
      # shows up in the job log.
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
      # Keep the whole reports directory downloadable after the run,
      # including when the test step failed.
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
          path: reports
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
-    needs: [run_tests_torch_gpu, run_tests_tf_gpu, run_tests_torch_multi_gpu, run_tests_tf_multi_gpu]
+    needs: [
        run_tests_torch_gpu,
        run_tests_tf_gpu,
        run_tests_torch_multi_gpu,
        run_tests_tf_multi_gpu,
        run_tests_torch_cuda_extensions_gpu,
        run_tests_torch_cuda_extensions_multi_gpu
    ]
    steps:
      - uses: actions/checkout@v2
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -246,11 +246,84 @@ jobs:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports
  # Scheduled single-GPU job for the CUDA-extension integrations: runs inside
  # the NVIDIA PyTorch container, installs the `testing` and `deepspeed`
  # extras and executes tests/deepspeed and tests/extended.
  run_all_tests_torch_cuda_extensions_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2
      - name: NVIDIA-SMI
        run: |
          nvidia-smi
      # The extras spec is quoted so the shell can never expand `[...]` as a
      # glob pattern against files in the checkout.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install ".[testing,deepspeed]"
      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
      # Runs even after a failed step so the short failure summary always
      # shows up in the job log.
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
      # Keep the whole reports directory downloadable after the run,
      # including when the test step failed.
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_cuda_extensions_gpu_test_reports
          path: reports
  # Scheduled multi-GPU job for the CUDA-extension integrations: runs inside
  # the NVIDIA PyTorch container, installs the `testing`, `deepspeed` and
  # `fairscale` extras and executes tests/deepspeed and tests/extended.
  run_all_tests_torch_cuda_extensions_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
      image: nvcr.io/nvidia/pytorch:21.03-py3
      # `--gpus all` explicitly asks Docker to expose every GPU of the
      # multi-GPU runner to the container.
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2
      - name: NVIDIA-SMI
        run: |
          nvidia-smi
      # The extras spec is quoted so the shell can never expand `[...]` as a
      # glob pattern against files in the checkout.
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install ".[testing,deepspeed,fairscale]"
      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
      - name: Run all tests on GPU
        run: |
          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
      # Runs even after a failed step so the short failure summary always
      # shows up in the job log.
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
      # Keep the whole reports directory downloadable after the run,
      # including when the test step failed.
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_cuda_extensions_multi_gpu_test_reports
          path: reports
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
-    needs: [run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu]
+    needs: [
        run_all_tests_torch_gpu,
        run_all_tests_tf_gpu,
        run_all_tests_torch_multi_gpu,
        run_all_tests_tf_multi_gpu,
        run_all_tests_torch_cuda_extensions_gpu,
        run_all_tests_torch_cuda_extensions_multi_gpu
    ]
    steps:
      - uses: actions/checkout@v2