diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index 5f408e88fc..8af6f8ea5c 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -10,73 +10,42 @@ on: - "tests/**" - ".github/**" - "templates/**" - # pull_request: repository_dispatch: - jobs: run_tests_torch_gpu: - runs-on: [self-hosted, gpu, single-gpu] + runs-on: [self-hosted, docker-gpu, single-gpu] + container: + image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - uses: actions/checkout@v2 - - name: Python version + - name: Launcher docker + uses: actions/checkout@v2 + + - name: NVIDIA-SMI run: | - which python - python --version - pip --version - - - name: Current dir - run: pwd - - - run: nvidia-smi - - - name: Kill any run-away pytest processes - run: (pkill -f tests; pkill -f examples) || echo "no zombies" - - - name: Loading cache. - uses: actions/cache@v2 - id: cache - with: - path: .env - key: v1.2-tests_torch_gpu-${{ hashFiles('setup.py') }} - - - name: Create new python env (on self-hosted runners we have to handle isolation ourselves) - run: | - python -m venv .env - source .env/bin/activate - which python - python --version - pip --version + nvidia-smi - name: Install dependencies run: | - source .env/bin/activate - sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev + apt -y update && apt install -y libsndfile1-dev pip install --upgrade pip - pip install .[torch,sklearn,testing,onnxruntime,sentencepiece,speech] - pip install git+https://github.com/huggingface/datasets + pip install .[sklearn,testing,onnxruntime,sentencepiece,speech] - name: Are GPUs recognized by our DL frameworks run: | - source .env/bin/activate python -c "import torch; print('Cuda available:', torch.cuda.is_available())" + python -c "import torch; print('Cuda version:', torch.version.cuda)" + python -c "import torch; print('CuDNN version:', 
torch.backends.cudnn.version())" python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())" -# - name: Create model files -# run: | -# source .env/bin/activate -# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model -# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model -# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model -# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model - - name: Run all non-slow tests on GPU env: - OMP_NUM_THREADS: 1 - CUDA_VISIBLE_DEVICES: 0 + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + HF_HOME: /mnt/cache run: | - source .env/bin/activate - python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_gpu tests + python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_gpu tests - name: Failure short reports if: ${{ always() }} @@ -89,68 +58,38 @@ jobs: name: run_all_tests_torch_gpu_test_reports path: reports - run_tests_tf_gpu: - runs-on: [self-hosted, gpu, single-gpu] + runs-on: [self-hosted, docker-gpu, single-gpu] + container: + image: tensorflow/tensorflow:2.4.1-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - uses: actions/checkout@v2 - - name: Python version + - name: Launcher docker + uses: actions/checkout@v2 + + - name: NVIDIA-SMI run: | - which python - python --version - pip --version - - - name: Current dir - run: pwd - - - run: nvidia-smi - - - name: Kill any run-away pytest processes - run: (pkill -f tests; pkill -f examples) || echo "no zombies" - - - name: Loading cache. 
- uses: actions/cache@v2 - id: cache - with: - path: .env - key: v1.2-tests_tf_gpu-${{ hashFiles('setup.py') }} - - - name: Create new python env (on self-hosted runners we have to handle isolation ourselves) - run: | - python -m venv .env - source .env/bin/activate - which python - python --version - pip --version + nvidia-smi - name: Install dependencies run: | - source .env/bin/activate pip install --upgrade pip - pip install .[tf,sklearn,testing,onnxruntime,sentencepiece] - pip install git+https://github.com/huggingface/datasets + pip install .[sklearn,testing,onnxruntime,sentencepiece] - name: Are GPUs recognized by our DL frameworks run: | - source .env/bin/activate TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))" TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))" - - name: Create model files - run: | - source .env/bin/activate -# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model -# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model -# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model -# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model - - name: Run all non-slow tests on GPU env: - OMP_NUM_THREADS: 1 - CUDA_VISIBLE_DEVICES: 0 + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + TF_NUM_INTRAOP_THREADS: 8 + TF_NUM_INTEROP_THREADS: 1 + HF_HOME: /mnt/cache run: | - source .env/bin/activate - python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_gpu tests + python -m pytest -n 2 
--dist=loadfile --make-reports=tests_tf_gpu tests - name: Failure short reports if: ${{ always() }} @@ -163,58 +102,41 @@ jobs: name: run_all_tests_tf_gpu_test_reports path: reports + run_tests_torch_multi_gpu: - runs-on: [self-hosted, gpu, multi-gpu] + runs-on: [self-hosted, docker-gpu, multi-gpu] + container: + image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - uses: actions/checkout@v2 - - name: Python version + - name: Launcher docker + uses: actions/checkout@v2 + + - name: NVIDIA-SMI run: | - which python - python --version - pip --version + nvidia-smi - - name: Current dir - run: pwd - - - run: nvidia-smi - - - name: Kill any run-away pytest processes - run: (pkill -f tests; pkill -f examples) || echo "no zombies" - - - name: Loading cache. - uses: actions/cache@v2 - id: cache - with: - path: .env - key: v1.2-tests_torch_multi_gpu-${{ hashFiles('setup.py') }} - - - name: Create new python env (on self-hosted runners we have to handle isolation ourselves) - run: | - python -m venv .env - source .env/bin/activate - which python - python --version - pip --version - name: Install dependencies run: | - source .env/bin/activate - sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev + apt -y update && apt install -y libsndfile1-dev pip install --upgrade pip - pip install .[torch,sklearn,testing,onnxruntime,sentencepiece,speech] - pip install git+https://github.com/huggingface/datasets + pip install .[sklearn,testing,onnxruntime,sentencepiece,speech] - name: Are GPUs recognized by our DL frameworks run: | - source .env/bin/activate python -c "import torch; print('Cuda available:', torch.cuda.is_available())" + python -c "import torch; print('Cuda version:', torch.version.cuda)" + python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())" python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())" 
- name: Run all non-slow tests on GPU env: - OMP_NUM_THREADS: 1 + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + MKL_SERVICE_FORCE_INTEL: 1 + HF_HOME: /mnt/cache run: | - source .env/bin/activate - python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests + python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_multi_gpu tests - name: Failure short reports if: ${{ always() }} @@ -228,56 +150,37 @@ jobs: path: reports run_tests_tf_multi_gpu: - runs-on: [self-hosted, gpu, multi-gpu] + runs-on: [self-hosted, docker-gpu, multi-gpu] + container: + image: tensorflow/tensorflow:2.4.1-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - uses: actions/checkout@v2 - - name: Python version + - name: Launcher docker + uses: actions/checkout@v2 + + - name: NVIDIA-SMI run: | - which python - python --version - pip --version + nvidia-smi - - name: Current dir - run: pwd - - - run: nvidia-smi - - - name: Kill any run-away pytest processes - run: (pkill -f tests; pkill -f examples) || echo "no zombies" - - - name: Loading cache. 
- uses: actions/cache@v2 - id: cache - with: - path: .env - key: v1.2-tests_tf_multi_gpu-${{ hashFiles('setup.py') }} - - - name: Create new python env (on self-hosted runners we have to handle isolation ourselves) - run: | - python -m venv .env - source .env/bin/activate - which python - python --version - pip --version - name: Install dependencies run: | - source .env/bin/activate pip install --upgrade pip - pip install .[tf,sklearn,testing,onnxruntime,sentencepiece] - pip install git+https://github.com/huggingface/datasets + pip install .[sklearn,testing,onnxruntime,sentencepiece] - name: Are GPUs recognized by our DL frameworks run: | - source .env/bin/activate TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))" TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))" - name: Run all non-slow tests on GPU env: - OMP_NUM_THREADS: 1 + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + TF_NUM_INTRAOP_THREADS: 8 + TF_NUM_INTEROP_THREADS: 1 + HF_HOME: /mnt/cache run: | - source .env/bin/activate - python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests + python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_multi_gpu tests - name: Failure short reports if: ${{ always() }} @@ -289,3 +192,22 @@ jobs: with: name: run_all_tests_tf_multi_gpu_test_reports path: reports + + send_results: + name: Send results to webhook + runs-on: ubuntu-latest + if: always() + needs: [run_tests_torch_gpu, run_tests_tf_gpu, run_tests_torch_multi_gpu, run_tests_tf_multi_gpu] + steps: + - uses: actions/checkout@v2 + + - uses: actions/download-artifact@v2 + + - name: Send message to Slack + env: + CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} + CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} + + run: | + pip install slack_sdk + python utils/notification_service.py push \ No newline at end 
of file diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 66e3487f39..5072041113 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -1,8 +1,3 @@ -# configuration notes: -# -# - `source .env/bin/activate` is currently needed to be run first thing first in each step. Otherwise -# the step uses the system-wide python interpreter. - name: Self-hosted runner (scheduled) on: @@ -15,61 +10,39 @@ on: jobs: run_all_tests_torch_gpu: - runs-on: [self-hosted, gpu, single-gpu] + runs-on: [self-hosted, docker-gpu, single-gpu] + container: + image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - uses: actions/checkout@v2 + - name: Launcher docker + uses: actions/checkout@v2 - - name: Loading cache. - uses: actions/cache@v2 - id: cache - with: - path: .env - key: v 1.2-slow_tests_torch_gpu-${{ hashFiles('setup.py') }} - - - name: Python version + - name: NVIDIA-SMI run: | - which python - python --version - pip --version - - - name: Current dir - run: pwd - - - run: nvidia-smi - - - name: Kill any run-away pytest processes - run: (pkill -f tests; pkill -f examples) || echo "no zombies" - - - name: Create new python env (on self-hosted runners we have to handle isolation ourselves) - if: steps.cache.outputs.cache-hit != 'true' - run: | - python -m venv .env - source .env/bin/activate - which python - python --version - pip --version + nvidia-smi - name: Install dependencies run: | - source .env/bin/activate + apt -y update && apt install -y libsndfile1-dev pip install --upgrade pip - pip install .[torch,sklearn,testing,onnxruntime,sentencepiece] - pip install git+https://github.com/huggingface/datasets - pip list + pip install .[sklearn,testing,onnxruntime,sentencepiece,speech] - name: Are GPUs recognized by our DL frameworks run: | - source .env/bin/activate python -c "import torch; 
print('Cuda available:', torch.cuda.is_available())" + python -c "import torch; print('Cuda version:', torch.version.cuda)" + python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())" python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())" - name: Run all tests on GPU env: - OMP_NUM_THREADS: 1 + OMP_NUM_THREADS: 16 + MKL_NUM_THREADS: 16 RUN_SLOW: yes + HF_HOME: /mnt/cache run: | - source .env/bin/activate - python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_gpu tests + python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_gpu tests - name: Failure short reports if: ${{ always() }} @@ -78,12 +51,13 @@ jobs: - name: Run examples tests on GPU if: ${{ always() }} env: - OMP_NUM_THREADS: 1 + OMP_NUM_THREADS: 16 + MKL_NUM_THREADS: 16 RUN_SLOW: yes + HF_HOME: /mnt/cache run: | - source .env/bin/activate pip install -r examples/_tests_requirements.txt - python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples + python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples - name: Failure short reports if: ${{ always() }} @@ -92,13 +66,13 @@ jobs: - name: Run all pipeline tests on GPU if: ${{ always() }} env: - TF_FORCE_GPU_ALLOW_GROWTH: "true" - OMP_NUM_THREADS: 1 + OMP_NUM_THREADS: 16 + MKL_NUM_THREADS: 16 RUN_SLOW: yes RUN_PIPELINE_TESTS: yes + HF_HOME: /mnt/cache run: | - source .env/bin/activate - python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests + python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests - name: Failure short reports if: ${{ always() }} @@ -111,64 +85,39 @@ jobs: name: run_all_tests_torch_gpu_test_reports path: reports - run_all_tests_tf_gpu: - runs-on: [self-hosted, gpu, single-gpu] + runs-on: [self-hosted, docker-gpu, single-gpu] + container: + image: tensorflow/tensorflow:2.4.1-gpu + options: --gpus 0 --shm-size "16gb" 
--ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - uses: actions/checkout@v2 + - name: Launcher docker + uses: actions/checkout@v2 - - name: Loading cache. - uses: actions/cache@v2 - id: cache - with: - path: .env - key: v1.2-slow_tests_tf_gpu-${{ hashFiles('setup.py') }} - - - name: Python version + - name: NVIDIA-SMI run: | - which python - python --version - pip --version - - - name: Current dir - run: pwd - - - run: nvidia-smi - - - name: Kill any run-away pytest processes - run: (pkill -f tests; pkill -f examples) || echo "no zombies" - - - - name: Create new python env (on self-hosted runners we have to handle isolation ourselves) - if: steps.cache.outputs.cache-hit != 'true' - run: | - python -m venv .env - source .env/bin/activate - which python - python --version - pip --version + nvidia-smi - name: Install dependencies run: | - source .env/bin/activate pip install --upgrade pip - pip install .[tf,sklearn,testing,onnxruntime,sentencepiece] - pip install git+https://github.com/huggingface/datasets - pip list + pip install .[sklearn,testing,onnx,sentencepiece] - name: Are GPUs recognized by our DL frameworks run: | - source .env/bin/activate TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))" TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))" - name: Run all tests on GPU env: - OMP_NUM_THREADS: 1 RUN_SLOW: yes + HF_HOME: /mnt/cache + OMP_NUM_THREADS: 16 + TF_NUM_INTEROP_THREADS: 1 + TF_NUM_INTRAOP_THREADS: 16 + MKL_NUM_THREADS: 16 run: | - source .env/bin/activate - python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_gpu tests + python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests - name: Failure short reports if: ${{ always() }} @@ -177,17 +126,19 @@ jobs: - name: Run all pipeline tests on GPU if: ${{ always() }} env: - 
TF_FORCE_GPU_ALLOW_GROWTH: "true" - OMP_NUM_THREADS: 1 RUN_SLOW: yes + HF_HOME: /mnt/cache + OMP_NUM_THREADS: 16 RUN_PIPELINE_TESTS: yes + TF_NUM_INTEROP_THREADS: 1 + TF_NUM_INTRAOP_THREADS: 16 + MKL_NUM_THREADS: 16 run: | - source .env/bin/activate - python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_gpu tests + python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests - name: Failure short reports if: ${{ always() }} - run: cat reports/tests_tf_pipelines_gpu_failures_short.txt + run: cat reports/tests_tf_pipeline_gpu_failures_short.txt - name: Test suite reports artifacts if: ${{ always() }} @@ -197,92 +148,55 @@ jobs: path: reports run_all_tests_torch_multi_gpu: - runs-on: [self-hosted, gpu, multi-gpu] + runs-on: [self-hosted, docker-gpu, multi-gpu] + container: + image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - uses: actions/checkout@v2 + - name: Launcher docker + uses: actions/checkout@v2 - - name: Loading cache. 
- uses: actions/cache@v2 - id: cache - with: - path: .env - key: v1.2-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }} - - - name: Python version + - name: NVIDIA-SMI run: | - which python - python --version - pip --version - - - name: Current dir - run: pwd - - - run: nvidia-smi - - - name: Kill any run-away pytest processes - run: (pkill -f tests; pkill -f examples) || echo "no zombies" - - - name: Create new python env (on self-hosted runners we have to handle isolation ourselves) - if: steps.cache.outputs.cache-hit != 'true' - run: | - python -m venv .env - source .env/bin/activate - which python - python --version - pip --version + nvidia-smi - name: Install dependencies run: | - source .env/bin/activate + apt -y update && apt install -y libsndfile1-dev pip install --upgrade pip - pip install .[torch,sklearn,testing,onnxruntime,sentencepiece] - pip install git+https://github.com/huggingface/datasets - pip install fairscale - pip install deepspeed - pip list + pip install .[sklearn,testing,onnxruntime,sentencepiece,speech] - name: Are GPUs recognized by our DL frameworks run: | - source .env/bin/activate python -c "import torch; print('Cuda available:', torch.cuda.is_available())" + python -c "import torch; print('Cuda version:', torch.version.cuda)" + python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())" python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())" - - name: Run all tests on multi-GPU + - name: Run all tests on GPU env: - OMP_NUM_THREADS: 1 RUN_SLOW: yes + HF_HOME: /mnt/cache + OMP_NUM_THREADS: 16 + MKL_NUM_THREADS: 16 + MKL_SERVICE_FORCE_INTEL: 1 run: | - source .env/bin/activate - python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests + python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_multi_gpu tests - name: Failure short reports if: ${{ always() }} run: cat reports/tests_torch_multi_gpu_failures_short.txt - - name: Run examples tests on 
multi-GPU + - name: Run all pipeline tests on GPU if: ${{ always() }} env: - OMP_NUM_THREADS: 1 - RUN_SLOW: yes - run: | - source .env/bin/activate - pip install -r examples/_tests_requirements.txt - python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_examples_multi_gpu examples - - - name: Failure short reports - if: ${{ always() }} - run: cat reports/tests_torch_examples_multi_gpu_failures_short.txt - - - name: Run all pipeline tests on multi-GPU - if: ${{ always() }} - env: - TF_FORCE_GPU_ALLOW_GROWTH: "true" - OMP_NUM_THREADS: 1 + OMP_NUM_THREADS: 16 + MKL_NUM_THREADS: 16 RUN_SLOW: yes RUN_PIPELINE_TESTS: yes + HF_HOME: /mnt/cache run: | - source .env/bin/activate - python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests + python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests - name: Failure short reports if: ${{ always() }} @@ -296,76 +210,55 @@ jobs: path: reports run_all_tests_tf_multi_gpu: - runs-on: [self-hosted, gpu, multi-gpu] + runs-on: [self-hosted, docker-gpu, multi-gpu] + container: + image: tensorflow/tensorflow:2.4.1-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - uses: actions/checkout@v2 + - name: Launcher docker + uses: actions/checkout@v2 - - name: Loading cache. 
- uses: actions/cache@v2 - id: cache - with: - path: .env - key: v1.2-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }} - - - name: Python version + - name: NVIDIA-SMI run: | - which python - python --version - pip --version - - - name: Current dir - run: pwd - - - run: nvidia-smi - - - name: Kill any run-away pytest processes - run: (pkill -f tests; pkill -f examples) || echo "no zombies" - - - name: Create new python env (on self-hosted runners we have to handle isolation ourselves) - if: steps.cache.outputs.cache-hit != 'true' - run: | - python -m venv .env - source .env/bin/activate - which python - python --version - pip --version + nvidia-smi - name: Install dependencies run: | - source .env/bin/activate pip install --upgrade pip - pip install .[tf,sklearn,testing,onnxruntime,sentencepiece] - pip install git+https://github.com/huggingface/datasets - pip list + pip install .[sklearn,testing,onnx,sentencepiece] - name: Are GPUs recognized by our DL frameworks run: | - source .env/bin/activate TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))" TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))" - - name: Run all tests on multi-GPU + - name: Run all tests on GPU env: - OMP_NUM_THREADS: 1 + OMP_NUM_THREADS: 16 RUN_SLOW: yes + MKL_NUM_THREADS: 16 + TF_NUM_INTEROP_THREADS: 1 + TF_NUM_INTRAOP_THREADS: 16 + HF_HOME: /mnt/cache run: | - source .env/bin/activate - python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests + python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests - name: Failure short reports if: ${{ always() }} run: cat reports/tests_tf_multi_gpu_failures_short.txt - - name: Run all pipeline tests on multi-GPU + - name: Run all pipeline tests on GPU if: ${{ always() }} env: - TF_FORCE_GPU_ALLOW_GROWTH: "true" - OMP_NUM_THREADS: 1 + 
OMP_NUM_THREADS: 16 RUN_SLOW: yes RUN_PIPELINE_TESTS: yes + MKL_NUM_THREADS: 16 + TF_NUM_INTEROP_THREADS: 1 + TF_NUM_INTRAOP_THREADS: 16 + HF_HOME: /mnt/cache run: | - source .env/bin/activate - python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests + python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests - name: Failure short reports if: ${{ always() }} @@ -377,3 +270,23 @@ jobs: with: name: run_all_tests_tf_multi_gpu_test_reports path: reports + + send_results: + name: Send results to webhook + runs-on: ubuntu-latest + if: always() + needs: [run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu] + steps: + - uses: actions/checkout@v2 + + - uses: actions/download-artifact@v2 + + - name: Send message to Slack + env: + CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} + CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} + + + run: | + pip install slack_sdk + python utils/notification_service.py scheduled diff --git a/setup.py b/setup.py index 7903198180..16567d71c0 100644 --- a/setup.py +++ b/setup.py @@ -115,6 +115,7 @@ _deps = [ "psutil", "pydantic", "pytest", + "pytest-sugar", "pytest-xdist", "python>=3.6.0", "recommonmark", @@ -225,6 +226,7 @@ else: extras["tokenizers"] = deps_list("tokenizers") extras["onnxruntime"] = deps_list("onnxruntime", "onnxruntime-tools") +extras["onnx"] = deps_list("onnxconverter-common", "keras2onnx") + extras["onnxruntime"] extras["modelcreation"] = deps_list("cookiecutter") extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette") @@ -232,7 +234,7 @@ extras["speech"] = deps_list("soundfile", "torchaudio") extras["sentencepiece"] = deps_list("sentencepiece", "protobuf") extras["testing"] = ( - deps_list("pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets") + deps_list("pytest", "pytest-xdist", "timeout-decorator", 
"parameterized", "psutil", "datasets", "pytest-sugar") + extras["retrieval"] + extras["modelcreation"] ) diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 6022ac220b..576fbe7cd6 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -28,6 +28,7 @@ deps = { "psutil": "psutil", "pydantic": "pydantic", "pytest": "pytest", + "pytest-sugar": "pytest-sugar", "pytest-xdist": "pytest-xdist", "python": "python>=3.6.0", "recommonmark": "recommonmark", diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 13838fab40..063aba5553 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -137,6 +137,17 @@ def slow(test_case): return test_case +def tooslow(test_case): + """ + Decorator marking a test as too slow. + + Slow tests are skipped while they're in the process of being fixed. No test should stay tagged as "tooslow" as + these will not be tested by the CI. + + """ + return unittest.skip("test is too slow")(test_case) + + def custom_tokenizers(test_case): """ Decorator marking a test for a custom tokenizer. 
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 6f66350a9c..a2f7085660 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -25,7 +25,14 @@ from importlib import import_module from typing import List, Tuple from transformers import is_tf_available -from transformers.testing_utils import _tf_gpu_memory_limit, is_pt_tf_cross_test, require_onnx, require_tf, slow +from transformers.testing_utils import ( + _tf_gpu_memory_limit, + is_pt_tf_cross_test, + require_onnx, + require_tf, + slow, + tooslow, +) if is_tf_available(): @@ -129,7 +136,7 @@ class TFModelTesterMixin: self.assert_outputs_same(after_outputs, outputs) - @slow + @tooslow def test_graph_mode(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: @@ -143,7 +150,7 @@ class TFModelTesterMixin: outputs = run_in_graph_mode() self.assertIsNotNone(outputs) - @slow + @tooslow def test_xla_mode(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: @@ -184,7 +191,7 @@ class TFModelTesterMixin: expected_arg_names = ["input_ids"] self.assertListEqual(arg_names[:1], expected_arg_names) - @slow + @tooslow def test_saved_model_creation(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.output_hidden_states = False @@ -205,7 +212,7 @@ class TFModelTesterMixin: saved_model_dir = os.path.join(tmpdirname, "saved_model", "1") self.assertTrue(os.path.exists(saved_model_dir)) - @slow + @tooslow def test_saved_model_creation_extended(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.output_hidden_states = True @@ -314,7 +321,7 @@ class TFModelTesterMixin: onnxruntime.InferenceSession(onnx_model.SerializeToString()) - @slow + @tooslow def test_mixed_precision(self): 
tf.keras.mixed_precision.experimental.set_policy("mixed_float16") @@ -488,7 +495,7 @@ class TFModelTesterMixin: max_diff = np.amax(np.abs(tfo - pto)) self.assertLessEqual(max_diff, 4e-2) - @slow + @tooslow def test_train_pipeline_custom_model(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() # head_mask and decoder_head_mask has different shapes than other input args @@ -909,7 +916,7 @@ class TFModelTesterMixin: model(inputs) - @slow + @tooslow def test_graph_mode_with_inputs_embeds(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/utils/notification_service.py b/utils/notification_service.py
new file mode 100644
index 0000000000..fb3fdebcf8
--- /dev/null
+++ b/utils/notification_service.py
@@ -0,0 +1,219 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Post a summary of the CI test reports to Slack.
+
+Usage: `python utils/notification_service.py scheduled` or `python utils/notification_service.py push`. The report
+files are opened with paths relative to the current working directory; the Slack credentials come from the
+`CI_SLACK_BOT_TOKEN` and `CI_SLACK_CHANNEL_ID` environment variables.
+"""
+
+import datetime
+import os
+import re
+import sys
+
+from slack_sdk import WebClient
+
+
+def handle_test_results(test_results):
+    """
+    Extract the number of failed and passed tests and the time spent from the content of a stats report.
+
+    Args:
+        test_results (str): Text split on single spaces, in which each "failed" / "passed" word directly follows
+            its count, e.g. "== 1 failed, 2 passed in 75.00s (0:01:15) ==".
+
+    Returns:
+        tuple: `(failed, success, time_spent)`, where `time_spent` is the unmodified duration token.
+    """
+    expressions = test_results.split(" ")
+
+    failed = 0
+    success = 0
+
+    # When the output is short enough, the output is surrounded by = signs: "== OUTPUT =="
+    # When it is too long, those signs are not present.
+    time_spent = expressions[-2] if "=" in expressions[-1] else expressions[-1]
+
+    for i, expression in enumerate(expressions):
+        # Compare whole tokens: a substring check would also count "xfailed" and "xpassed" entries.
+        outcome = expression.strip().rstrip(",")
+        if outcome == "failed":
+            failed += int(expressions[i - 1])
+        if outcome == "passed":
+            success += int(expressions[i - 1])
+
+    return failed, success, time_spent
+
+
+def format_for_slack(total_results, results, scheduled: bool):
+    """
+    Build the Slack Block Kit payload summarizing a CI run.
+
+    Args:
+        total_results (dict): Aggregated counts, read through the keys "failed" and "success".
+        results (dict): Maps each job name to a dict read through the keys "failed", "success" and "time_spent".
+        scheduled (bool): Whether the header announces the scheduled run (otherwise the self-push run).
+
+    Returns:
+        dict: A dict with a single "blocks" key holding the list of blocks to post.
+    """
+    print(results)
+    today = datetime.date.today()
+    header = {
+        "type": "header",
+        "text": {
+            "type": "plain_text",
+            "text": f"🤗 Results of the scheduled tests, {today:%B} {today.day}, {today.year}."
+            if scheduled
+            else "🤗 Self-push results",
+            "emoji": True,
+        },
+    }
+
+    total = (
+        {
+            "type": "section",
+            "fields": [
+                {"type": "mrkdwn", "text": f"*Failures:*\n❌ {total_results['failed']} failures."},
+                {"type": "mrkdwn", "text": f"*Passed:*\n✅ {total_results['success']} tests passed."},
+            ],
+        }
+        if total_results["failed"] > 0
+        else {
+            "type": "section",
+            "fields": [{"type": "mrkdwn", "text": f"*Congrats!*\nAll {total_results['success']} tests pass."}],
+        }
+    )
+
+    blocks = [header, total]
+
+    if total_results["failed"] > 0:
+        for key, result in results.items():
+            print(key, result)
+            blocks.append({"type": "header", "text": {"type": "plain_text", "text": key, "emoji": True}})
+            blocks.append(
+                {
+                    "type": "section",
+                    "fields": [
+                        {
+                            "type": "mrkdwn",
+                            "text": f"*Results:*\n{result['failed']} failed, {result['success']} passed.",
+                        },
+                        {"type": "mrkdwn", "text": f"*Time spent:*\n{result['time_spent']}"},
+                    ],
+                }
+            )
+    else:
+        for key, result in results.items():
+            blocks.append(
+                {"type": "section", "fields": [{"type": "mrkdwn", "text": f"*{key}*\n{result['time_spent']}."}]}
+            )
+
+    # A section with an empty text is not a valid block, so the footer is only added when the link to the
+    # workflow run can be built from the default environment variables documented for GitHub Actions.
+    server_url = os.environ.get("GITHUB_SERVER_URL")
+    repository = os.environ.get("GITHUB_REPOSITORY")
+    run_id = os.environ.get("GITHUB_RUN_ID")
+    if server_url and repository and run_id:
+        run_url = f"{server_url}/{repository}/actions/runs/{run_id}"
+        blocks.append({"type": "section", "text": {"type": "mrkdwn", "text": f"<{run_url}|View on GitHub>"}})
+
+    blocks = {"blocks": blocks}
+
+    return blocks
+
+
+if __name__ == "__main__":
+    scheduled = sys.argv[1] == "scheduled"
+
+    if scheduled:
+        # The scheduled run has several artifacts for each job.
+        file_paths = {
+            "TF Single GPU": {
+                "common": "run_all_tests_tf_gpu_test_reports/tests_tf_gpu_[].txt",
+                "pipeline": "run_all_tests_tf_gpu_test_reports/tests_tf_pipeline_gpu_[].txt",
+            },
+            "Torch Single GPU": {
+                "common": "run_all_tests_torch_gpu_test_reports/tests_torch_gpu_[].txt",
+                "pipeline": "run_all_tests_torch_gpu_test_reports/tests_torch_pipeline_gpu_[].txt",
+                "examples": "run_all_tests_torch_gpu_test_reports/examples_torch_gpu_[].txt",
+            },
+            "TF Multi GPU": {
+                "common": "run_all_tests_tf_multi_gpu_test_reports/tests_tf_multi_gpu_[].txt",
+                "pipeline": "run_all_tests_tf_multi_gpu_test_reports/tests_tf_pipeline_multi_gpu_[].txt",
+            },
+            "Torch Multi GPU": {
+                "common": "run_all_tests_torch_multi_gpu_test_reports/tests_torch_multi_gpu_[].txt",
+                "pipeline": "run_all_tests_torch_multi_gpu_test_reports/tests_torch_pipeline_multi_gpu_[].txt",
+            },
+        }
+    else:
+        file_paths = {
+            "TF Single GPU": {"common": "run_all_tests_tf_gpu_test_reports/tests_tf_gpu_[].txt"},
+            "Torch Single GPU": {"common": "run_all_tests_torch_gpu_test_reports/tests_torch_gpu_[].txt"},
+            "TF Multi GPU": {"common": "run_all_tests_tf_multi_gpu_test_reports/tests_tf_multi_gpu_[].txt"},
+            "Torch Multi GPU": {"common": "run_all_tests_torch_multi_gpu_test_reports/tests_torch_multi_gpu_[].txt"},
+        }
+
+    client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
+    channel_id = os.environ["CI_SLACK_CHANNEL_ID"]
+
+    try:
+        results = {}
+        for job, file_dict in file_paths.items():
+
+            # Single return value for failed/success across steps of a same job
+            results[job] = {"failed": 0, "success": 0, "time_spent": "", "failures": ""}
+
+            for key, file_path in file_dict.items():
+                # "[]" is the placeholder replaced by the report kind ("stats", "summary_short").
+                with open(file_path.replace("[]", "stats")) as f:
+                    failed, success, time_spent = handle_test_results(f.read())
+                    results[job]["failed"] += failed
+                    results[job]["success"] += success
+                    # Drop surrounding whitespace and the parentheses around the duration, when present.
+                    results[job]["time_spent"] += time_spent.strip().strip("()") + ", "
+                with open(file_path.replace("[]", "summary_short")) as f:
+                    for line in f:
+                        if re.search("FAILED", line):
+                            results[job]["failures"] += line
+
+            # Remove the trailing ", "
+            results[job]["time_spent"] = results[job]["time_spent"][:-2]
+
+        test_results_keys = ["failed", "success"]
+        total = {"failed": 0, "success": 0}
+        for job, job_result in results.items():
+            for result_key in test_results_keys:
+                total[result_key] += job_result[result_key]
+
+        to_be_sent_to_slack = format_for_slack(total, results, scheduled)
+
+        result = client.chat_postMessage(
+            channel=channel_id,
+            blocks=to_be_sent_to_slack["blocks"],
+        )
+
+        for job, job_result in results.items():
+            if len(job_result["failures"]):
+                client.chat_postMessage(
+                    channel=channel_id, text=f"{job}\n{job_result['failures']}", thread_ts=result["ts"]
+                )
+
+    except Exception as e:
+        # Nothing is posted to Slack on error: every exception is re-raised with this message.
+        raise Exception(f"Setup error: no artifacts were found. Error: {e}") from e