diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml
new file mode 100644
index 0000000000..4c2fbd226e
--- /dev/null
+++ b/.github/workflows/self-nightly-scheduled.yml
@@ -0,0 +1,259 @@
+name: Self-hosted runner; Nightly (scheduled)
+
+# Triggers: pushes to nightly_ci* branches, repository_dispatch events, and a
+# cron schedule firing at 00:00 on every third day of the month.
+on:
+  push:
+    branches:
+      - nightly_ci*
+  repository_dispatch:
+  schedule:
+    - cron: "0 0 */3 * *"
+
+# Workflow-level environment, inherited by every job and step below.
+env:
+  HF_HOME: /mnt/cache
+  TRANSFORMERS_IS_CI: "yes"
+  RUN_SLOW: "yes"
+  OMP_NUM_THREADS: 16
+  MKL_NUM_THREADS: 16
+  PYTEST_TIMEOUT: 600
+
+jobs:
+  run_all_tests_torch_gpu:
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y libsndfile1-dev git
+          pip install --upgrade pip
+          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+          pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_gpu_failures_short.txt
+
+      - name: Run examples tests on GPU
+        if: ${{ always() }}
+        env:
+          OMP_NUM_THREADS: 16
+          MKL_NUM_THREADS: 16
+          RUN_SLOW: "yes"
+          HF_HOME: /mnt/cache
+          TRANSFORMERS_IS_CI: "yes"
+        run: |
+          pip install -r examples/pytorch/_tests_requirements.txt
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/examples_torch_gpu_failures_short.txt
+
+      - name: Run all pipeline tests on GPU
+        if: ${{ always() }}
+        env:
+          RUN_PIPELINE_TESTS: "yes"
+        run: |
+          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_pipeline_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_torch_gpu_test_reports
+          path: reports
+
+  run_all_tests_torch_multi_gpu:
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        continue-on-error: true
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y libsndfile1-dev git
+          pip install --upgrade pip
+          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+          pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        env:
+          MKL_SERVICE_FORCE_INTEL: 1
+        run: |
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_multi_gpu_failures_short.txt
+
+      - name: Run all pipeline tests on GPU
+        if: ${{ always() }}
+        env:
+          RUN_PIPELINE_TESTS: "yes"
+        run: |
+          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_torch_multi_gpu_test_reports
+          path: reports
+
+  run_all_tests_torch_cuda_extensions_gpu:
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y libaio-dev
+          pip install --upgrade pip
+          pip install .[testing,deepspeed]
+          pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_tests_torch_cuda_extensions_gpu_test_reports
+          path: reports
+
+  run_all_tests_torch_cuda_extensions_multi_gpu:
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      # Multi-GPU runner: expose every GPU to the container, not only device 0.
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        continue-on-error: true
+        run: |
+          nvidia-smi
+
+      # The nightly PyTorch build is installed here, before the test step, so
+      # that the suite actually runs against it.
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y libaio-dev
+          pip install --upgrade pip
+          pip install .[testing,deepspeed,fairscale]
+          pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
+          path: reports
+
+  send_results:
+    name: Send results to webhook
+    runs-on: ubuntu-latest
+    if: always()
+    needs:
+      - run_all_tests_torch_gpu
+      - run_all_tests_torch_multi_gpu
+      - run_all_tests_torch_cuda_extensions_gpu
+      - run_all_tests_torch_cuda_extensions_multi_gpu
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: actions/download-artifact@v2
+
+      - name: Send message to Slack
+        env:
+          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+          CI_SLACK_CHANNEL_ID_PAST_FUTURE: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
+
+        run: |
+          pip install slack_sdk
+          python utils/notification_service.py scheduled nightly-torch
diff --git a/utils/notification_service.py b/utils/notification_service.py
index 735b06c0d1..2296c16396 100644
--- a/utils/notification_service.py
+++ b/utils/notification_service.py
@@ -38,13 +38,13 @@ def handle_test_results(test_results):
     return failed, success, time_spent
 
 
-def format_for_slack(total_results, results, scheduled: bool):
+def format_for_slack(total_results, results, scheduled: bool, title: str):
     print(total_results, results)
     header = {
         "type": "header",
         "text": {
             "type": "plain_text",
-            "text": "🤗 Results of the scheduled tests." if scheduled else "🤗 Self-push results",
+            "text": title,
             "emoji": True,
         },
     }
@@ -105,7 +105,13 @@ def format_for_slack(total_results, results, scheduled: bool):
 
 
 if __name__ == "__main__":
-    scheduled = sys.argv[1] == "scheduled"
+    arguments = sys.argv[1:]
+
+    if "scheduled" in arguments:
+        arguments.remove("scheduled")
+        scheduled = True
+    else:
+        scheduled = False
 
     if scheduled:
         # The scheduled run has several artifacts for each job.
@@ -149,7 +155,22 @@ if __name__ == "__main__":
         }
 
     client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
-    channel_id = os.environ["CI_SLACK_CHANNEL_ID_DAILY"] if scheduled else os.environ["CI_SLACK_CHANNEL_ID"]
+
+    # Channel selection: push runs, scheduled runs launched with extra CLI
+    # arguments (e.g. "nightly-torch"), or the plain scheduled run.
+    if not scheduled:
+        channel_id = os.environ["CI_SLACK_CHANNEL_ID"]
+    elif arguments:
+        channel_id = os.environ["CI_SLACK_CHANNEL_ID_PAST_FUTURE"]
+    else:
+        channel_id = os.environ["CI_SLACK_CHANNEL_ID_DAILY"]
+
+    title = "🤗 Results of the scheduled tests." if scheduled else "🤗 Self-push results"
+
+    # Prefix the header with the remaining CLI arguments; join them so the
+    # header does not show the Python list repr (e.g. "['nightly-torch']").
+    if arguments:
+        title = f"[{', '.join(arguments)}] {title}"
 
     try:
         results = {}
@@ -182,7 +203,7 @@ if __name__ == "__main__":
                 total[result_key] += job_result[result_key]
 
         if total["failed"] != 0 or scheduled:
-            to_be_sent_to_slack = format_for_slack(total, results, scheduled)
+            to_be_sent_to_slack = format_for_slack(total, results, scheduled, title)
 
             result = client.chat_postMessage(
                 channel=channel_id,