Nightly torch ci (#13550)

* Nightly CI torch * Version * Reformat * Only subset Fix * Revert * Better formatting * New channel
2021-09-13 16:17:29 -04:00
parent 5c14fceac0
commit 3ab0185b06
2 changed files with 280 additions and 5 deletions
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@@ -0,0 +1,255 @@
 # Runs the test suites against a nightly PyTorch build on self-hosted GPU runners.
 name: Self-hosted runner; Nightly (scheduled)
 # Triggers: pushes to branches matching nightly_ci*, repository_dispatch events, and a cron schedule.
 on:
    push:
        branches:
            - nightly_ci*
    repository_dispatch:
    schedule:
        # 00:00 on every third day of the month (1st, 4th, 7th, ...), so despite the workflow
        # name this does not fire every night.
        - cron: "0 0 */3 * *"
 # Workflow-level environment, inherited by every job below.
 env:
    # Same path the job containers mount the shared cache on (see each job's `options`).
    HF_HOME: /mnt/cache
    TRANSFORMERS_IS_CI: yes
    # NOTE(review): presumably enables tests marked as slow - confirm against the test utilities.
    RUN_SLOW: yes
    OMP_NUM_THREADS: 16
    MKL_NUM_THREADS: 16
    # NOTE(review): looks like a per-test timeout in seconds - confirm how pytest picks it up.
    PYTEST_TIMEOUT: 600
 jobs:
    # Single-GPU job: runs the `tests` suite, the `examples` suite and the pipeline-marked tests
    # with a nightly torch build installed on top of the pytorch 1.9.0 image.
    run_all_tests_torch_gpu:
        runs-on: [self-hosted, docker-gpu, single-gpu]
        container:
            image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
            # Mounts the host Hugging Face cache on /mnt/cache/, the path HF_HOME points to.
            options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
        steps:
            # Despite its name, this step only checks out the repository.
            - name: Launcher docker
              uses: actions/checkout@v2
            - name: NVIDIA-SMI
              run: |
                  nvidia-smi
            # The nightly torch install comes last and uses -U so it replaces any torch that the
            # image or the extras above already provided.
            - name: Install dependencies
              run: |
                  apt -y update && apt install -y libsndfile1-dev git
                  pip install --upgrade pip
                  pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
            # Sanity output only: prints what the freshly installed torch sees.
            - name: Are GPUs recognized by our DL frameworks
              run: |
                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
            # `--make-reports=<id>` is expected to write reports/<id>_failures_short.txt, which the
            # next step prints.
            - name: Run all tests on GPU
              run: |
                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests
            # `always()` keeps this step running even when the test step above failed.
            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/tests_torch_gpu_failures_short.txt
            # The step-level env below repeats values already set in the workflow-level `env`.
            - name: Run examples tests on GPU
              if: ${{ always() }}
              env:
                  OMP_NUM_THREADS: 16
                  MKL_NUM_THREADS: 16
                  RUN_SLOW: yes
                  HF_HOME: /mnt/cache
                  TRANSFORMERS_IS_CI: yes
              run: |
                  pip install -r examples/pytorch/_tests_requirements.txt
                  python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples
            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/examples_torch_gpu_failures_short.txt
            # Runs `tests` again, restricted to the `is_pipeline_test` marker, with RUN_PIPELINE_TESTS set.
            - name: Run all pipeline tests on GPU
              if: ${{ always() }}
              env:
                  RUN_PIPELINE_TESTS: yes
              run: |
                  python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/tests_torch_pipeline_gpu_failures_short.txt
            # Uploads the whole `reports` directory, regardless of test outcome.
            - name: Test suite reports artifacts
              if: ${{ always() }}
              uses: actions/upload-artifact@v2
              with:
                  name: run_all_tests_torch_gpu_test_reports
                  path: reports
    # Multi-GPU job: runs the `tests` suite and the pipeline-marked tests with a nightly torch
    # build installed on top of the pytorch 1.9.0 image.
    run_all_tests_torch_multi_gpu:
        runs-on: [self-hosted, docker-gpu, multi-gpu]
        container:
            image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
            # Mounts the host Hugging Face cache on /mnt/cache/, the path HF_HOME points to.
            options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
        steps:
            # Despite its name, this step only checks out the repository.
            - name: Launcher docker
              uses: actions/checkout@v2
            # Informational only: a failing nvidia-smi does not fail the job.
            - name: NVIDIA-SMI
              continue-on-error: true
              run: |
                  nvidia-smi
            # The nightly torch install comes last and uses -U so it replaces any torch that the
            # image or the extras above already provided. Every line of the script must share the
            # same indentation, otherwise the block scalar ends early.
            - name: Install dependencies
              run: |
                  apt -y update && apt install -y libsndfile1-dev git
                  pip install --upgrade pip
                  pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
            # Sanity output only: prints what the freshly installed torch sees.
            - name: Are GPUs recognized by our DL frameworks
              run: |
                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
            # `--make-reports=<id>` is expected to write reports/<id>_failures_short.txt, which the
            # next step prints.
            - name: Run all tests on GPU
              env:
                  MKL_SERVICE_FORCE_INTEL: 1
              run: |
                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests
            # `always()` keeps this step running even when the test step above failed.
            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/tests_torch_multi_gpu_failures_short.txt
            # Runs `tests` again, restricted to the `is_pipeline_test` marker, with RUN_PIPELINE_TESTS set.
            - name: Run all pipeline tests on GPU
              if: ${{ always() }}
              env:
                  RUN_PIPELINE_TESTS: yes
              run: |
                  python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt
            # Uploads the whole `reports` directory, regardless of test outcome.
            - name: Test suite reports artifacts
              if: ${{ always() }}
              uses: actions/upload-artifact@v2
              with:
                  name: run_all_tests_torch_multi_gpu_test_reports
                  path: reports
    # Single-GPU job for the CUDA-extension test directories (tests/deepspeed and tests/extended),
    # run inside the NVIDIA PyTorch 21.03 image with a nightly torch build installed on top.
    run_all_tests_torch_cuda_extensions_gpu:
        runs-on: [self-hosted, docker-gpu, single-gpu]
        container:
            image: nvcr.io/nvidia/pytorch:21.03-py3
            # Mounts the host Hugging Face cache on /mnt/cache/, the path HF_HOME points to.
            options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
        steps:
            # Despite its name, this step only checks out the repository.
            - name: Launcher docker
              uses: actions/checkout@v2
            - name: NVIDIA-SMI
              run: |
                  nvidia-smi
            # The nightly torch install comes last and uses -U so it replaces any torch that the
            # image or the extras above already provided.
            - name: Install dependencies
              run: |
                  apt -y update && apt install -y libaio-dev
                  pip install --upgrade pip
                  pip install .[testing,deepspeed]
                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
            # Sanity output only: prints what the freshly installed torch sees.
            - name: Are GPUs recognized by our DL frameworks
              run: |
                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
            # `--make-reports=<id>` is expected to write reports/<id>_failures_short.txt, which the
            # next step prints.
            - name: Run all tests on GPU
              run: |
                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
            # `always()` keeps this step running even when the test step above failed.
            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
            # NOTE(review): the artifact name omits the `all_` found in the job id; whatever consumes
            # the artifacts may depend on this exact name - confirm before renaming.
            - name: Test suite reports artifacts
              if: ${{ always() }}
              uses: actions/upload-artifact@v2
              with:
                  name: run_tests_torch_cuda_extensions_gpu_test_reports
                  path: reports
    # Multi-GPU job for the CUDA-extension test directories (tests/deepspeed and tests/extended),
    # run inside the NVIDIA PyTorch 21.03 image with a nightly torch build installed on top.
    run_all_tests_torch_cuda_extensions_multi_gpu:
        runs-on: [self-hosted, docker-gpu, multi-gpu]
        container:
            image: nvcr.io/nvidia/pytorch:21.03-py3
            # `--gpus all` matches the other multi-GPU job in this workflow; the volume mounts the
            # host Hugging Face cache on /mnt/cache/, the path HF_HOME points to.
            options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
        steps:
            # Despite its name, this step only checks out the repository.
            - name: Launcher docker
              uses: actions/checkout@v2
            # Informational only: a failing nvidia-smi does not fail the job.
            - name: NVIDIA-SMI
              continue-on-error: true
              run: |
                  nvidia-smi
            # The nightly torch build has to be in place before the GPU check and the tests run,
            # so it is installed here (last, with -U, so it replaces any torch already present).
            - name: Install dependencies
              run: |
                  apt -y update && apt install -y libaio-dev
                  pip install --upgrade pip
                  pip install .[testing,deepspeed,fairscale]
                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
            # Sanity output only: prints what the freshly installed torch sees.
            - name: Are GPUs recognized by our DL frameworks
              run: |
                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
            # `--make-reports=<id>` is expected to write reports/<id>_failures_short.txt, which the
            # next step prints.
            - name: Run all tests on GPU
              run: |
                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
            # `always()` keeps this step running even when the test step above failed.
            - name: Failure short reports
              if: ${{ always() }}
              run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
            # NOTE(review): the artifact name omits the `all_` found in the job id; whatever consumes
            # the artifacts may depend on this exact name - confirm before renaming.
            - name: Test suite reports artifacts
              if: ${{ always() }}
              uses: actions/upload-artifact@v2
              with:
                  name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
                  path: reports
    # Reporting job: waits for the four test jobs, collects their report artifacts and posts a
    # summary to Slack through utils/notification_service.py.
    send_results:
        name: Send results to webhook
        runs-on: ubuntu-latest
        # Run even when one of the needed jobs failed, so failures still get reported.
        if: always()
        needs: [
                run_all_tests_torch_gpu,
                run_all_tests_torch_multi_gpu,
                run_all_tests_torch_cuda_extensions_gpu,
                run_all_tests_torch_cuda_extensions_multi_gpu
        ]
        steps:
            - uses: actions/checkout@v2
            # No `name` input is given, so every artifact of this run is downloaded.
            - uses: actions/download-artifact@v2
            - name: Send message to Slack
              env:
                  CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
                  CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
                  CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
                  CI_SLACK_CHANNEL_ID_PAST_FUTURE: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
              # `scheduled` selects the scheduled-run report; the extra `nightly-torch` argument is
              # used by the script to prefix the title and to pick the PAST_FUTURE channel.
              run: |
                  pip install slack_sdk
                  python utils/notification_service.py scheduled nightly-torch
--- a/utils/notification_service.py
+++ b/utils/notification_service.py
@@ -38,13 +38,13 @@ def handle_test_results(test_results):
    return failed, success, time_spent
-def format_for_slack(total_results, results, scheduled: bool):
+def format_for_slack(total_results, results, scheduled: bool, title: str):
    print(total_results, results)
    header = {
        "type": "header",
        "text": {
            "type": "plain_text",
-            "text": "🤗 Results of the scheduled tests." if scheduled else "🤗 Self-push results",
+            "text": title,
            "emoji": True,
        },
    }
@@ -105,7 +105,13 @@ def format_for_slack(total_results, results, scheduled: bool):
 if __name__ == "__main__":
-    scheduled = sys.argv[1] == "scheduled"
+    arguments = sys.argv[1:]
    if "scheduled" in arguments:
        arguments.remove("scheduled")
        scheduled = True
    else:
        scheduled = False
    if scheduled:
        # The scheduled run has several artifacts for each job.
@@ -149,7 +155,21 @@ if __name__ == "__main__":
        }
    client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
-    channel_id = os.environ["CI_SLACK_CHANNEL_ID_DAILY"] if scheduled else os.environ["CI_SLACK_CHANNEL_ID"]
+
    if not scheduled:
        channel_id = os.environ["CI_SLACK_CHANNEL_ID"]
    elif scheduled and len(arguments):
        channel_id = os.environ["CI_SLACK_CHANNEL_ID_PAST_FUTURE"]
    else:
        channel_id = os.environ["CI_SLACK_CHANNEL_ID_DAILY"]
    if scheduled:
        title = "🤗 Results of the scheduled tests."
    else:
        title = "🤗 Self-push results"
    if len(arguments):
        title = f"{arguments} " + title
    try:
        results = {}
@@ -182,7 +202,7 @@ if __name__ == "__main__":
                total[result_key] += job_result[result_key]
        if total["failed"] != 0 or scheduled:
-            to_be_sent_to_slack = format_for_slack(total, results, scheduled)
+            to_be_sent_to_slack = format_for_slack(total, results, scheduled, title)
            result = client.chat_postMessage(
                channel=channel_id,