Run CI on deepspeed and fairscale (#11172)
* Run CI on deepspeed and fairscale
* Test it on this branch :)
* Rename
* Update the CI image
This commit is contained in:
77
.github/workflows/self-push.yml
vendored
77
.github/workflows/self-push.yml
vendored
@@ -5,6 +5,7 @@ on:
|
|||||||
branches:
|
branches:
|
||||||
- master
|
- master
|
||||||
- ci_*
|
- ci_*
|
||||||
|
- ci-*
|
||||||
paths:
|
paths:
|
||||||
- "src/**"
|
- "src/**"
|
||||||
- "tests/**"
|
- "tests/**"
|
||||||
@@ -186,11 +187,85 @@ jobs:
|
|||||||
name: run_all_tests_tf_multi_gpu_test_reports
|
name: run_all_tests_tf_multi_gpu_test_reports
|
||||||
path: reports
|
path: reports
|
||||||
|
|
||||||
|
run_tests_torch_cuda_extensions_gpu:
|
||||||
|
runs-on: [self-hosted, docker-gpu, single-gpu]
|
||||||
|
container:
|
||||||
|
image: nvcr.io/nvidia/pytorch:21.03-py3
|
||||||
|
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
|
steps:
|
||||||
|
- name: Launcher docker
|
||||||
|
uses: actions/checkout@v2
|
||||||
|
|
||||||
|
- name: NVIDIA-SMI
|
||||||
|
run: |
|
||||||
|
nvidia-smi
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
pip install --upgrade pip
|
||||||
|
pip install .[testing,deepspeed]
|
||||||
|
|
||||||
|
- name: Are GPUs recognized by our DL frameworks
|
||||||
|
run: |
|
||||||
|
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
|
||||||
|
python -c "import torch; print('Cuda version:', torch.version.cuda)"
|
||||||
|
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
|
||||||
|
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
|
||||||
|
|
||||||
|
- name: Run all tests on GPU
|
||||||
|
run: |
|
||||||
|
python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
|
||||||
|
|
||||||
|
- name: Failure short reports
|
||||||
|
if: ${{ always() }}
|
||||||
|
run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
|
||||||
|
|
||||||
|
run_tests_torch_cuda_extensions_multi_gpu:
|
||||||
|
runs-on: [self-hosted, docker-gpu, multi-gpu]
|
||||||
|
container:
|
||||||
|
image: nvcr.io/nvidia/pytorch:21.03-py3
|
||||||
|
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
|
steps:
|
||||||
|
- name: Launcher docker
|
||||||
|
uses: actions/checkout@v2
|
||||||
|
|
||||||
|
- name: NVIDIA-SMI
|
||||||
|
run: |
|
||||||
|
nvidia-smi
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
pip install --upgrade pip
|
||||||
|
pip install .[testing,deepspeed,fairscale]
|
||||||
|
|
||||||
|
- name: Are GPUs recognized by our DL frameworks
|
||||||
|
run: |
|
||||||
|
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
|
||||||
|
python -c "import torch; print('Cuda version:', torch.version.cuda)"
|
||||||
|
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
|
||||||
|
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
|
||||||
|
|
||||||
|
- name: Run all tests on GPU
|
||||||
|
run: |
|
||||||
|
python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
|
||||||
|
|
||||||
|
- name: Failure short reports
|
||||||
|
if: ${{ always() }}
|
||||||
|
run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
|
||||||
|
|
||||||
|
|
||||||
send_results:
|
send_results:
|
||||||
name: Send results to webhook
|
name: Send results to webhook
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
if: always()
|
if: always()
|
||||||
needs: [run_tests_torch_gpu, run_tests_tf_gpu, run_tests_torch_multi_gpu, run_tests_tf_multi_gpu]
|
needs: [
|
||||||
|
run_tests_torch_gpu,
|
||||||
|
run_tests_tf_gpu,
|
||||||
|
run_tests_torch_multi_gpu,
|
||||||
|
run_tests_tf_multi_gpu,
|
||||||
|
run_tests_torch_cuda_extensions_gpu,
|
||||||
|
run_tests_torch_cuda_extensions_multi_gpu
|
||||||
|
]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v2
|
||||||
|
|
||||||
|
|||||||
75
.github/workflows/self-scheduled.yml
vendored
75
.github/workflows/self-scheduled.yml
vendored
@@ -246,11 +246,84 @@ jobs:
|
|||||||
name: run_all_tests_tf_multi_gpu_test_reports
|
name: run_all_tests_tf_multi_gpu_test_reports
|
||||||
path: reports
|
path: reports
|
||||||
|
|
||||||
|
run_all_tests_torch_cuda_extensions_gpu:
|
||||||
|
runs-on: [self-hosted, docker-gpu, single-gpu]
|
||||||
|
container:
|
||||||
|
image: nvcr.io/nvidia/pytorch:21.03-py3
|
||||||
|
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
|
steps:
|
||||||
|
- name: Launcher docker
|
||||||
|
uses: actions/checkout@v2
|
||||||
|
|
||||||
|
- name: NVIDIA-SMI
|
||||||
|
run: |
|
||||||
|
nvidia-smi
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
pip install --upgrade pip
|
||||||
|
pip install .[testing,deepspeed]
|
||||||
|
|
||||||
|
- name: Are GPUs recognized by our DL frameworks
|
||||||
|
run: |
|
||||||
|
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
|
||||||
|
python -c "import torch; print('Cuda version:', torch.version.cuda)"
|
||||||
|
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
|
||||||
|
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
|
||||||
|
|
||||||
|
- name: Run all tests on GPU
|
||||||
|
run: |
|
||||||
|
python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
|
||||||
|
|
||||||
|
- name: Failure short reports
|
||||||
|
if: ${{ always() }}
|
||||||
|
run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
|
||||||
|
|
||||||
|
run_all_tests_torch_cuda_extensions_multi_gpu:
|
||||||
|
runs-on: [self-hosted, docker-gpu, multi-gpu]
|
||||||
|
container:
|
||||||
|
image: nvcr.io/nvidia/pytorch:21.03-py3
|
||||||
|
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
|
steps:
|
||||||
|
- name: Launcher docker
|
||||||
|
uses: actions/checkout@v2
|
||||||
|
|
||||||
|
- name: NVIDIA-SMI
|
||||||
|
run: |
|
||||||
|
nvidia-smi
|
||||||
|
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
pip install --upgrade pip
|
||||||
|
pip install .[testing,deepspeed,fairscale]
|
||||||
|
|
||||||
|
- name: Are GPUs recognized by our DL frameworks
|
||||||
|
run: |
|
||||||
|
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
|
||||||
|
python -c "import torch; print('Cuda version:', torch.version.cuda)"
|
||||||
|
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
|
||||||
|
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
|
||||||
|
|
||||||
|
- name: Run all tests on GPU
|
||||||
|
run: |
|
||||||
|
python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
|
||||||
|
|
||||||
|
- name: Failure short reports
|
||||||
|
if: ${{ always() }}
|
||||||
|
run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
|
||||||
|
|
||||||
send_results:
|
send_results:
|
||||||
name: Send results to webhook
|
name: Send results to webhook
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
if: always()
|
if: always()
|
||||||
needs: [run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu]
|
needs: [
|
||||||
|
run_all_tests_torch_gpu,
|
||||||
|
run_all_tests_tf_gpu,
|
||||||
|
run_all_tests_torch_multi_gpu,
|
||||||
|
run_all_tests_tf_multi_gpu,
|
||||||
|
run_all_tests_torch_cuda_extensions_gpu,
|
||||||
|
run_all_tests_torch_cuda_extensions_multi_gpu
|
||||||
|
]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v2
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user