HuggingFace_transformer/.github/workflows/self-scheduled.yml

name: Self-hosted runner (scheduled)

on:
  repository_dispatch:
  schedule:
    - cron: "0 2 * * *"

env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1

jobs:
  setup:
    name: Setup
    strategy:
      matrix:
        machines: [multi-gpu-docker, single-gpu-docker]
    runs-on: ${{ matrix.machines }}
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Update clone
        working-directory: /transformers
        run: |
          git fetch && git checkout ${{ github.sha }}

      - name: Cleanup
        working-directory: /transformers
        run: |
          rm -rf tests/__pycache__
          rm -rf reports

      - id: set-matrix
        name: Identify models to test
        working-directory: /transformers/tests
        run: |
          echo "::set-output name=matrix::$(python3 -c 'import os; x = list(filter(os.path.isdir, os.listdir(os.getcwd()))); x.sort(); print(x)')"

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: GPU visibility
        working-directory: /transformers
        run: |
          utils/print_env_pt.py
          TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

  run_tests_gpu:
    name: Model tests
    strategy:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
        machines: [multi-gpu-docker, single-gpu-docker]
    runs-on: ${{ matrix.machines }}
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Echo folder ${{ matrix.folders }}
        run: echo "${{ matrix.folders }}"

      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Run all non-slow tests on GPU
        working-directory: /transformers
        run: python3 -m pytest -v --make-reports=${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machines }}_run_all_tests_gpu_${{ matrix.folders }}_test_reports
          path: /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}

  run_examples_gpu:
    name: Examples directory
    runs-on: [self-hosted, single-gpu-docker]
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Run examples tests on GPU
        working-directory: /transformers
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
          python3 -m pytest -v --make-reports=examples_gpu examples/pytorch

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/examples_gpu/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_examples_gpu
          path: /transformers/reports/examples_gpu

  run_pipelines_torch_gpu:
    name: PyTorch pipelines
    strategy:
      fail-fast: false
      matrix:
        machines: [multi-gpu-docker, single-gpu-docker]
    runs-on: ${{ matrix.machines }}
    container:
      image: huggingface/transformers-pytorch-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Run all pipeline tests on GPU
        working-directory: /transformers
        env:
          RUN_PIPELINE_TESTS: yes
        run: |
          python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machines }}_tests_torch_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machines }}_tests_torch_pipeline_gpu/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machines }}_run_tests_torch_pipeline_gpu
          path: /transformers/reports/${{ matrix.machines }}_tests_torch_pipeline_gpu

  run_pipelines_tf_gpu:
    name: TensorFlow pipelines
    strategy:
      fail-fast: false
      matrix:
        machines: [multi-gpu-docker, single-gpu-docker]
    runs-on: ${{ matrix.machines }}
    container:
      image: huggingface/transformers-tensorflow-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Update clone
        working-directory: /transformers
        run: |
          git fetch && git checkout ${{ github.sha }}

      - name: Run all pipeline tests on GPU
        working-directory: /transformers
        env:
          RUN_PIPELINE_TESTS: yes
        run: |
          python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machines }}_tests_tf_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: |
          cat /transformers/reports/${{ matrix.machines }}_tests_tf_pipeline_gpu/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machines }}_run_tests_tf_pipeline_gpu
          path: /transformers/reports/${{ matrix.machines }}_tests_tf_pipeline_gpu

  run_all_tests_torch_cuda_extensions_gpu:
    name: Torch CUDA extension tests
    strategy:
      fail-fast: false
      matrix:
        machines: [multi-gpu-docker, single-gpu-docker]
    runs-on: ${{ matrix.machines }}
    needs: setup
    container:
      image: huggingface/transformers-pytorch-deepspeed-latest-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
        working-directory: /workspace/transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Re-compile DeepSpeed
        working-directory: /workspace
        run: |
          pip install deepspeed # installs the deps correctly
          rm -rf DeepSpeed
          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
          DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: Run all tests on GPU
        working-directory: /workspace/transformers
        run: |
          python -m pytest -v --make-reports=${{ matrix.machines }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /workspace/transformers/reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu/failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: ${{ matrix.machines }}_run_tests_torch_cuda_extensions_gpu_test_reports
          path: /workspace/transformers/reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu


  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
    needs: [setup, run_tests_gpu, run_examples_gpu, run_pipelines_tf_gpu, run_pipelines_torch_gpu, run_all_tests_torch_cuda_extensions_gpu]
    steps:
      - uses: actions/checkout@v2
      - uses: actions/download-artifact@v2
      - name: Send message to Slack
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
        run: |
          pip install slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"