Use latest stable PyTorch/DeepSpeed for Push & Scheduled CI (#17417)

* update versions

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Yih-Dar
2022-06-07 11:53:05 +02:00
committed by GitHub
parent ad71965246
commit 9aa230aa2f
5 changed files with 27 additions and 39 deletions

View File

@@ -198,19 +198,12 @@ jobs:
machine_type: [single-gpu]
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
container:
image: nvcr.io/nvidia/pytorch:21.03-py3
image: huggingface/transformers-pytorch-deepspeed-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Checkout transformers
uses: actions/checkout@v2
with:
fetch-depth: 2
- name: Install dependencies
run: |
apt -y update && apt install -y libaio-dev
pip install --upgrade pip
pip install .[deepspeed-testing]
- name: Update clone
working-directory: /workspace/transformers
run: git fetch && git checkout ${{ github.sha }}
- name: NVIDIA-SMI
run: |
@@ -247,30 +240,24 @@ jobs:
machine_type: [multi-gpu]
runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}']
container:
image: nvcr.io/nvidia/pytorch:21.03-py3
image: huggingface/transformers-pytorch-deepspeed-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Checkout transformers
uses: actions/checkout@v2
with:
fetch-depth: 2
- name: Install dependencies
run: |
apt -y update && apt install -y libaio-dev
pip install --upgrade pip
rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
pip install .[testing,deepspeed,fairscale]
- name: Update clone
working-directory: /workspace/transformers
run: git fetch && git checkout ${{ github.sha }}
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Environment
working-directory: /workspace/transformers
run: |
python utils/print_env.py
- name: Run all non-slow selected tests on GPU
working-directory: /workspace/transformers
# TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
run: |
python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
@@ -278,14 +265,14 @@ jobs:
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
path: reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
send_results:
name: Send results to webhook

View File

@@ -306,14 +306,6 @@ jobs:
working-directory: /workspace/transformers
run: git fetch && git checkout ${{ github.sha }}
- name: Re-compile DeepSpeed
working-directory: /workspace
run: |
pip install deepspeed # installs the deps correctly
rm -rf DeepSpeed
git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI
run: |
nvidia-smi