Nightly torch ci (#13550)
* Nightly CI torch * Version * Reformat * Only subset Fix * Revert * Better formatting * New channel
This commit is contained in:
255
.github/workflows/self-nightly-scheduled.yml
vendored
Normal file
255
.github/workflows/self-nightly-scheduled.yml
vendored
Normal file
@@ -0,0 +1,255 @@
|
||||
name: Self-hosted runner; Nightly (scheduled)
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- nightly_ci*
|
||||
repository_dispatch:
|
||||
schedule:
|
||||
- cron: "0 0 */3 * *"
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
RUN_SLOW: yes
|
||||
OMP_NUM_THREADS: 16
|
||||
MKL_NUM_THREADS: 16
|
||||
PYTEST_TIMEOUT: 600
|
||||
|
||||
jobs:
|
||||
run_all_tests_torch_gpu:
|
||||
runs-on: [self-hosted, docker-gpu, single-gpu]
|
||||
container:
|
||||
image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Launcher docker
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt -y update && apt install -y libsndfile1-dev git
|
||||
pip install --upgrade pip
|
||||
pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
|
||||
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
|
||||
python -c "import torch; print('Cuda version:', torch.version.cuda)"
|
||||
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
|
||||
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
|
||||
|
||||
- name: Run all tests on GPU
|
||||
run: |
|
||||
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_gpu_failures_short.txt
|
||||
|
||||
- name: Run examples tests on GPU
|
||||
if: ${{ always() }}
|
||||
env:
|
||||
OMP_NUM_THREADS: 16
|
||||
MKL_NUM_THREADS: 16
|
||||
RUN_SLOW: yes
|
||||
HF_HOME: /mnt/cache
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
run: |
|
||||
pip install -r examples/pytorch/_tests_requirements.txt
|
||||
python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/examples_torch_gpu_failures_short.txt
|
||||
|
||||
- name: Run all pipeline tests on GPU
|
||||
if: ${{ always() }}
|
||||
env:
|
||||
RUN_PIPELINE_TESTS: yes
|
||||
run: |
|
||||
python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_pipeline_gpu_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: run_all_tests_torch_gpu_test_reports
|
||||
path: reports
|
||||
|
||||
run_all_tests_torch_multi_gpu:
|
||||
runs-on: [self-hosted, docker-gpu, multi-gpu]
|
||||
container:
|
||||
image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Launcher docker
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
continue-on-error: true
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt -y update && apt install -y libsndfile1-dev git
|
||||
pip install --upgrade pip
|
||||
pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
|
||||
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
|
||||
python -c "import torch; print('Cuda version:', torch.version.cuda)"
|
||||
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
|
||||
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
|
||||
|
||||
- name: Run all tests on GPU
|
||||
env:
|
||||
MKL_SERVICE_FORCE_INTEL: 1
|
||||
run: |
|
||||
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_multi_gpu_failures_short.txt
|
||||
|
||||
- name: Run all pipeline tests on GPU
|
||||
if: ${{ always() }}
|
||||
env:
|
||||
RUN_PIPELINE_TESTS: yes
|
||||
run: |
|
||||
python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: run_all_tests_torch_multi_gpu_test_reports
|
||||
path: reports
|
||||
|
||||
run_all_tests_torch_cuda_extensions_gpu:
|
||||
runs-on: [self-hosted, docker-gpu, single-gpu]
|
||||
container:
|
||||
image: nvcr.io/nvidia/pytorch:21.03-py3
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Launcher docker
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt -y update && apt install -y libaio-dev
|
||||
pip install --upgrade pip
|
||||
pip install .[testing,deepspeed]
|
||||
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
|
||||
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
|
||||
python -c "import torch; print('Cuda version:', torch.version.cuda)"
|
||||
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
|
||||
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
|
||||
|
||||
- name: Run all tests on GPU
|
||||
run: |
|
||||
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: run_tests_torch_cuda_extensions_gpu_test_reports
|
||||
path: reports
|
||||
|
||||
run_all_tests_torch_cuda_extensions_multi_gpu:
|
||||
runs-on: [self-hosted, docker-gpu, multi-gpu]
|
||||
container:
|
||||
image: nvcr.io/nvidia/pytorch:21.03-py3
|
||||
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Launcher docker
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
continue-on-error: true
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
apt -y update && apt install -y libaio-dev
|
||||
pip install --upgrade pip
|
||||
pip install .[testing,deepspeed,fairscale]
|
||||
|
||||
- name: Are GPUs recognized by our DL frameworks
|
||||
run: |
|
||||
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
|
||||
python -c "import torch; print('Cuda version:', torch.version.cuda)"
|
||||
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
|
||||
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
|
||||
|
||||
- name: Run all tests on GPU
|
||||
run: |
|
||||
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
|
||||
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ always() }}
|
||||
run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
|
||||
|
||||
- name: Test suite reports artifacts
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
|
||||
path: reports
|
||||
|
||||
send_results:
|
||||
name: Send results to webhook
|
||||
runs-on: ubuntu-latest
|
||||
if: always()
|
||||
needs: [
|
||||
run_all_tests_torch_gpu,
|
||||
run_all_tests_torch_multi_gpu,
|
||||
run_all_tests_torch_cuda_extensions_gpu,
|
||||
run_all_tests_torch_cuda_extensions_multi_gpu
|
||||
]
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- uses: actions/download-artifact@v2
|
||||
|
||||
- name: Send message to Slack
|
||||
env:
|
||||
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
|
||||
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
|
||||
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||
CI_SLACK_CHANNEL_ID_PAST_FUTURE: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
|
||||
|
||||
run: |
|
||||
pip install slack_sdk
|
||||
python utils/notification_service.py scheduled nightly-torch
|
||||
@@ -38,13 +38,13 @@ def handle_test_results(test_results):
|
||||
return failed, success, time_spent
|
||||
|
||||
|
||||
def format_for_slack(total_results, results, scheduled: bool):
|
||||
def format_for_slack(total_results, results, scheduled: bool, title: str):
|
||||
print(total_results, results)
|
||||
header = {
|
||||
"type": "header",
|
||||
"text": {
|
||||
"type": "plain_text",
|
||||
"text": "🤗 Results of the scheduled tests." if scheduled else "🤗 Self-push results",
|
||||
"text": title,
|
||||
"emoji": True,
|
||||
},
|
||||
}
|
||||
@@ -105,7 +105,13 @@ def format_for_slack(total_results, results, scheduled: bool):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
scheduled = sys.argv[1] == "scheduled"
|
||||
arguments = sys.argv[1:]
|
||||
|
||||
if "scheduled" in arguments:
|
||||
arguments.remove("scheduled")
|
||||
scheduled = True
|
||||
else:
|
||||
scheduled = False
|
||||
|
||||
if scheduled:
|
||||
# The scheduled run has several artifacts for each job.
|
||||
@@ -149,7 +155,21 @@ if __name__ == "__main__":
|
||||
}
|
||||
|
||||
client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
|
||||
channel_id = os.environ["CI_SLACK_CHANNEL_ID_DAILY"] if scheduled else os.environ["CI_SLACK_CHANNEL_ID"]
|
||||
|
||||
if not scheduled:
|
||||
channel_id = os.environ["CI_SLACK_CHANNEL_ID"]
|
||||
elif scheduled and len(arguments):
|
||||
channel_id = os.environ["CI_SLACK_CHANNEL_ID_PAST_FUTURE"]
|
||||
else:
|
||||
channel_id = os.environ["CI_SLACK_CHANNEL_ID_DAILY"]
|
||||
|
||||
if scheduled:
|
||||
title = "🤗 Results of the scheduled tests."
|
||||
else:
|
||||
title = "🤗 Self-push results"
|
||||
|
||||
if len(arguments):
|
||||
title = f"{arguments} " + title
|
||||
|
||||
try:
|
||||
results = {}
|
||||
@@ -182,7 +202,7 @@ if __name__ == "__main__":
|
||||
total[result_key] += job_result[result_key]
|
||||
|
||||
if total["failed"] != 0 or scheduled:
|
||||
to_be_sent_to_slack = format_for_slack(total, results, scheduled)
|
||||
to_be_sent_to_slack = format_for_slack(total, results, scheduled, title)
|
||||
|
||||
result = client.chat_postMessage(
|
||||
channel=channel_id,
|
||||
|
||||
Reference in New Issue
Block a user