From ca169dbdf189d30b3aacfd7cc50d668121361ec8 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 17 Jun 2022 16:42:27 +0200 Subject: [PATCH] Enable PyTorch nightly build CI (#17335) * nightly build pytorch CI * fix working dir * change time and event name Co-authored-by: ydshieh --- .github/workflows/build-docker-images.yml | 53 +++ .github/workflows/self-nightly-scheduled.yml | 424 +++++++++--------- .github/workflows/self-push.yml | 6 +- .github/workflows/self-scheduled.yml | 2 +- docker/transformers-all-latest-gpu/Dockerfile | 16 +- .../Dockerfile | 12 +- .../Dockerfile | 35 ++ 7 files changed, 321 insertions(+), 227 deletions(-) create mode 100644 docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 5ef1a9bac6..295f668d4d 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -39,6 +39,33 @@ jobs: push: true tags: huggingface/transformers-all-latest-gpu + latest-with-torch-nightly-docker: + name: "Nightly PyTorch + Stable TensorFlow" + runs-on: ubuntu-latest + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + - + name: Check out code + uses: actions/checkout@v2 + - + name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v2 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + PYTORCH=pre + push: true + tags: huggingface/transformers-all-latest-torch-nightly-gpu + latest-torch-deepspeed-docker: name: "Latest PyTorch + DeepSpeed" runs-on: ubuntu-latest @@ -65,6 +92,32 @@ jobs: push: true tags: huggingface/transformers-pytorch-deepspeed-latest-gpu + nightly-torch-deepspeed-docker: + name: "Nightly PyTorch + DeepSpeed" + runs-on: ubuntu-latest 
+ steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + - + name: Check out code + uses: actions/checkout@v2 + - + name: Login to DockerHub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v2 + with: + context: ./docker/transformers-pytorch-deepspeed-nightly-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu + doc-builder: name: "Doc builder" runs-on: ubuntu-latest diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml index 4b4c0aca9d..5dca2c07b1 100644 --- a/.github/workflows/self-nightly-scheduled.yml +++ b/.github/workflows/self-nightly-scheduled.yml @@ -1,250 +1,236 @@ -name: Self-hosted runner; Nightly (scheduled) +name: Self-hosted runner (nightly) + +# Note that each job's dependencies go into a corresponding docker file. 
+# +# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is +# `huggingface/transformers-pytorch-deepspeed-nightly-gpu`, which can be found at +# `docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile` on: - push: - branches: - - nightly_ci* - repository_dispatch: - schedule: - - cron: "0 0 */3 * *" + repository_dispatch: + schedule: + - cron: "0 16 * * *" env: - HF_HOME: /mnt/cache - TRANSFORMERS_IS_CI: yes - RUN_SLOW: yes - OMP_NUM_THREADS: 16 - MKL_NUM_THREADS: 16 - PYTEST_TIMEOUT: 600 - SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true + RUN_PT_TF_CROSS_TESTS: 1 jobs: - run_all_tests_torch_gpu: - runs-on: [self-hosted, docker-gpu, single-gpu] - container: - image: pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Launcher docker - uses: actions/checkout@v2 + setup: + name: Setup + strategy: + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + container: + image: huggingface/transformers-all-latest-torch-nightly-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - name: Update clone + working-directory: /transformers + run: | + git fetch && git checkout ${{ github.sha }} - - name: NVIDIA-SMI - run: | - nvidia-smi + - name: Cleanup + working-directory: /transformers + run: | + rm -rf tests/__pycache__ + rm -rf tests/models/__pycache__ + rm -rf reports - - name: Install dependencies - run: | - apt -y update && apt install -y libsndfile1-dev git espeak-ng - pip install --upgrade pip - pip install 
.[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm] - pip install https://github.com/kpu/kenlm/archive/master.zip - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U + - id: set-matrix + name: Identify models to test + working-directory: /transformers/tests + run: | + echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" - - name: Are GPUs recognized by our DL frameworks - run: | - utils/print_env.py + - name: NVIDIA-SMI + run: | + nvidia-smi - - name: Run all tests on GPU - run: | - python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests + run_tests_single_gpu: + name: Model tests + strategy: + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup.outputs.matrix) }} + machine_type: [single-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + container: + image: huggingface/transformers-all-latest-torch-nightly-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). 
+ run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - name: Failure short reports - if: ${{ always() }} - run: cat reports/tests_torch_gpu/failures_short.txt + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} - - name: Run examples tests on GPU - if: ${{ always() }} - env: - OMP_NUM_THREADS: 16 - MKL_NUM_THREADS: 16 - RUN_SLOW: yes - HF_HOME: /mnt/cache - TRANSFORMERS_IS_CI: yes - run: | - pip install -r examples/pytorch/_tests_requirements.txt - python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples + - name: NVIDIA-SMI + run: | + nvidia-smi - - name: Failure short reports - if: ${{ always() }} - run: cat reports/examples_torch_gpu/failures_short.txt + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py - - name: Run all pipeline tests on GPU - if: ${{ always() }} - env: - RUN_PIPELINE_TESTS: yes - run: | - python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests + - name: Run all tests on GPU + working-directory: /transformers + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - name: Failure short reports - if: ${{ always() }} - run: cat reports/tests_torch_pipeline_gpu/failures_short.txt + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v2 - with: - name: run_all_tests_torch_gpu_test_reports - path: reports + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + 
name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - run_all_tests_torch_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu] - container: - image: pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Launcher docker - uses: actions/checkout@v2 + run_tests_multi_gpu: + name: Model tests + strategy: + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup.outputs.matrix) }} + machine_type: [multi-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + container: + image: huggingface/transformers-all-latest-torch-nightly-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). 
+ run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - name: NVIDIA-SMI - continue-on-error: true - run: | - nvidia-smi + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} - - name: Install dependencies - run: | - apt -y update && apt install -y libsndfile1-dev git espeak-ng - pip install --upgrade pip - pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm] - pip install https://github.com/kpu/kenlm/archive/master.zip - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U + - name: NVIDIA-SMI + run: | + nvidia-smi - - name: Are GPUs recognized by our DL frameworks - run: | - utils/print_env.py + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py - - name: Run all tests on GPU - env: - MKL_SERVICE_FORCE_INTEL: 1 - run: | - python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests + - name: Run all tests on GPU + working-directory: /transformers + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - - name: Failure short reports - if: ${{ always() }} - run: cat reports/tests_torch_multi_gpu/failures_short.txt + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt - - name: Run all pipeline tests on GPU - if: ${{ always() }} - env: - RUN_PIPELINE_TESTS: yes - run: | - python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests + - name: Test suite reports artifacts + if: ${{ always() }} + uses: 
actions/upload-artifact@v2 + with: + name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} - - name: Failure short reports - if: ${{ always() }} - run: cat reports/tests_torch_pipeline_multi_gpu/failures_short.txt + run_all_tests_torch_cuda_extensions_gpu: + name: Torch CUDA extension tests + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} + needs: setup + container: + image: huggingface/transformers-pytorch-deepspeed-nightly-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /workspace/transformers + run: git fetch && git checkout ${{ github.sha }} - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v2 - with: - name: run_all_tests_torch_multi_gpu_test_reports - path: reports + # To avoid unknown test failures + - name: Pre build DeepSpeed *again* + working-directory: /workspace + run: | + python3 -m pip uninstall -y deepspeed + rm -rf DeepSpeed + git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build + DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . 
--global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - run_all_tests_torch_cuda_extensions_gpu: - runs-on: [self-hosted, docker-gpu, single-gpu] - container: - image: nvcr.io/nvidia/pytorch:21.03-py3 - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Launcher docker - uses: actions/checkout@v2 + - name: NVIDIA-SMI + run: | + nvidia-smi - - name: NVIDIA-SMI - run: | - nvidia-smi + - name: Environment + working-directory: /workspace/transformers + run: | + python utils/print_env.py - - name: Install dependencies - run: | - apt -y update && apt install -y libaio-dev libsndfile1-dev git espeak-ng - pip install --upgrade pip - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U - pip install .[deepspeed-testing] - pip install https://github.com/kpu/kenlm/archive/master.zip - pip install git+https://github.com/microsoft/DeepSpeed + - name: Run all tests on GPU + working-directory: /workspace/transformers + run: | + python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended - - name: Are GPUs recognized by our DL frameworks - run: | - utils/print_env.py + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt - - name: Run all tests on GPU - run: | - python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports + path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu - - name: Failure short reports - if: ${{ always() 
}} - run: cat reports/tests_torch_cuda_extensions_gpu/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v2 - with: - name: run_tests_torch_cuda_extensions_gpu_test_reports - path: reports - - run_all_tests_torch_cuda_extensions_multi_gpu: - runs-on: [self-hosted, docker-gpu, multi-gpu] - container: - image: nvcr.io/nvidia/pytorch:21.03-py3 - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Launcher docker - uses: actions/checkout@v2 - - - name: NVIDIA-SMI - continue-on-error: true - run: | - nvidia-smi - - - name: Install dependencies - run: | - apt -y update && apt install -y libaio-dev libsndfile1-dev git espeak-ng - pip install --upgrade pip - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U - rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds - pip install .[testing,fairscale] - pip install https://github.com/kpu/kenlm/archive/master.zip - pip install git+https://github.com/microsoft/DeepSpeed # testing bleeding edge - - - name: Are GPUs recognized by our DL frameworks - run: | - utils/print_env.py - - - name: Run all tests on GPU - run: | - python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended - - - name: Failure short reports - if: ${{ always() }} - run: cat reports/tests_torch_cuda_extensions_multi_gpu/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v2 - with: - name: run_tests_torch_cuda_extensions_multi_gpu_test_reports - path: reports - - send_results: - name: Send results to webhook - runs-on: ubuntu-latest - if: always() - needs: [ - run_all_tests_torch_gpu, - run_all_tests_torch_multi_gpu, - run_all_tests_torch_cuda_extensions_gpu, - run_all_tests_torch_cuda_extensions_multi_gpu - ] - steps: - - uses: 
actions/checkout@v2 - - - uses: actions/download-artifact@v2 - - - name: Send message to Slack - env: - CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} - CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} - CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} - CI_SLACK_CHANNEL_ID_PAST_FUTURE: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} - - run: | - pip install slack_sdk - python utils/notification_service.py scheduled nightly-torch + send_results: + name: Send results to webhook + runs-on: ubuntu-latest + if: always() + needs: [setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu] + steps: + - uses: actions/checkout@v2 + - uses: actions/download-artifact@v2 + - name: Send message to Slack + env: + CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} + CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} + CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} + CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} + CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} + CI_EVENT: nightly-build + # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change + # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. 
+ run: | + pip install slack_sdk + python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index a40a3b3eac..0251a32a1a 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -207,7 +207,7 @@ jobs: # To avoid unknown test failures - name: Pre build DeepSpeed *again* - working-directory: /workspace/transformers + working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check @@ -217,10 +217,12 @@ jobs: nvidia-smi - name: Environment + working-directory: /workspace/transformers run: | python utils/print_env.py - name: Run all non-slow selected tests on GPU + working-directory: /workspace/transformers # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests. 
run: | python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended @@ -256,7 +258,7 @@ jobs: # To avoid unknown test failures - name: Pre build DeepSpeed *again* - working-directory: /workspace/transformers + working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 8797287ba4..c1a4f3fd37 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -308,7 +308,7 @@ jobs: # To avoid unknown test failures - name: Pre build DeepSpeed *again* - working-directory: /workspace/transformers + working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 2bb12a4ebe..9c493844c2 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -3,6 +3,9 @@ LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive +# Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands) +SHELL ["sh", "-lc"] + # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). 
@@ -21,11 +24,20 @@ ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] -RUN python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA +# TODO: Handle these in a python utility script +RUN [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile +RUN echo torch=$VERSION +# `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build. +# Currently, let's just use their latest releases (when `torch` is installed with a release version). An explicit if/else is used (not `A && B || C`) so that a failed release install fails the build instead of silently falling back to the nightly install. +# TODO: We might need to specify proper versions that work with a specific torch version (especially for past CI). +RUN if [ "$PYTORCH" != "pre" ]; then python3 -m pip install --no-cache-dir -U "$VERSION" torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA; else python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA; fi + RUN python3 -m pip install --no-cache-dir -U tensorflow RUN python3 -m pip uninstall -y flax jax -RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$PYTORCH+$CUDA.html +# Use installed torch version for `torch-scatter` to avoid dealing with PYTORCH='pre'.
+# If torch is nightly version, the link is likely to be invalid, but the installation falls back to the latest torch-scatter +RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+$CUDA.html RUN python3 -m pip install --no-cache-dir intel_extension_for_pytorch==$INTEL_TORCH_EXT+cpu -f https://software.intel.com/ipex-whl-stable RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile index bd62628989..529c248314 100644 --- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile @@ -3,6 +3,9 @@ LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive +# Example: `cu102`, `cu113`, etc. +ARG CUDA='cu113' + RUN apt -y update RUN apt install -y libaio-dev RUN python3 -m pip install --no-cache-dir --upgrade pip @@ -13,13 +16,16 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers && # Install latest release PyTorch # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) -RUN python3 -m pip install --no-cache-dir -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 +RUN python3 -m pip install --no-cache-dir -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] -# Pre-build DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) +# Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) RUN python3 -m pip uninstall -y deepspeed -RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check +# This has to be run (again) inside the GPU VMs running the tests. +# The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests. +# TODO: Find out why test fail. +RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. diff --git a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile new file mode 100644 index 0000000000..3f880dd95d --- /dev/null +++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile @@ -0,0 +1,35 @@ +FROM nvcr.io/nvidia/pytorch:21.03-py3 +LABEL maintainer="Hugging Face" + +ARG DEBIAN_FRONTEND=noninteractive + +# Example: `cu102`, `cu113`, etc. 
+ARG CUDA='cu113' + +RUN apt -y update +RUN apt install -y libaio-dev +RUN python3 -m pip install --no-cache-dir --upgrade pip + +ARG REF=main +RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF + +# Install **nightly** release PyTorch (flag `--pre`) +# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) +# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) +RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA + +RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing] + +# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout) +RUN python3 -m pip uninstall -y deepspeed +# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.) +# Issue: https://github.com/microsoft/DeepSpeed/issues/2010 +# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \ +# DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1 + +# When installing in editable mode, `transformers` is not recognized as a package. +# this line must be added in order for python to be aware of transformers. +RUN cd transformers && python3 setup.py develop + +# Disable for now as deepspeed is not installed above. To be enabled once the issue is fixed. +# RUN python3 -c "from deepspeed.launcher.runner import main"