Nightly torch ci (#13550)
* Nightly CI torch * Version * Reformat * Only subset Fix * Revert * Better formatting * New channel
This commit is contained in:
255
.github/workflows/self-nightly-scheduled.yml
vendored
Normal file
255
.github/workflows/self-nightly-scheduled.yml
vendored
Normal file
@@ -0,0 +1,255 @@
|
|||||||
|
# Workflow header: display name, triggers, and the environment shared by all jobs.
name: Self-hosted runner; Nightly (scheduled)

on:
  # Any push to a branch whose name starts with `nightly_ci` starts a run.
  push:
    branches:
      - "nightly_ci*"
  # Allows the run to be started through a repository_dispatch event.
  repository_dispatch:
  # Cron fields: minute 0, hour 0, every third day of the month.
  schedule:
    - cron: '0 0 */3 * *'

# Workflow-level environment, visible to every step of every job below.
env:
  # Same path the job containers mount the host cache on (see each `container.options`).
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  RUN_SLOW: yes
  OMP_NUM_THREADS: 16
  MKL_NUM_THREADS: 16
  PYTEST_TIMEOUT: 600

jobs:
|
||||||
|
run_all_tests_torch_gpu:
  # Single-GPU job: installs the PyTorch nightly wheels on top of the stock
  # PyTorch image, then runs the test, example and pipeline suites.
  runs-on:
    - self-hosted
    - docker-gpu
    - single-gpu
  container:
    image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
    # `-v` maps the host Hugging Face cache directory onto /mnt/cache/ in the container.
    options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
  steps:
    - name: Launcher docker
      uses: actions/checkout@v2

    - name: NVIDIA-SMI
      run: |
        nvidia-smi

    # The last command upgrades torch/torchvision/torchaudio to the cu111 nightly
    # pre-release wheels after the project extras have been installed.
    - name: Install dependencies
      run: |
        apt -y update && apt install -y libsndfile1-dev git
        pip install --upgrade pip
        pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
        pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U

    # Prints what the freshly installed torch sees: CUDA availability, CUDA and
    # cuDNN versions, and the GPU count.
    - name: Are GPUs recognized by our DL frameworks
      run: |
        python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
        python -c "import torch; print('Cuda version:', torch.version.cuda)"
        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
        python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

    - name: Run all tests on GPU
      run: |
        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests

    # `always()` keeps this step running even when the test step above failed.
    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/tests_torch_gpu_failures_short.txt

    # The step-level env repeats values that are also set at workflow level.
    - name: Run examples tests on GPU
      if: ${{ always() }}
      env:
        OMP_NUM_THREADS: 16
        MKL_NUM_THREADS: 16
        RUN_SLOW: yes
        HF_HOME: /mnt/cache
        TRANSFORMERS_IS_CI: yes
      run: |
        pip install -r examples/pytorch/_tests_requirements.txt
        python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples

    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/examples_torch_gpu_failures_short.txt

    # `-m is_pipeline_test` restricts the run to tests carrying that pytest marker.
    - name: Run all pipeline tests on GPU
      if: ${{ always() }}
      env:
        RUN_PIPELINE_TESTS: yes
      run: |
        python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/tests_torch_pipeline_gpu_failures_short.txt

    # Uploads the whole `reports` directory as one artifact, pass or fail.
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: run_all_tests_torch_gpu_test_reports
        path: reports
|
||||||
|
|
||||||
|
run_all_tests_torch_multi_gpu:
  # Multi-GPU job: same image and nightly install as the single-GPU job, but the
  # container is started with `--gpus all` on a `multi-gpu` runner.
  runs-on:
    - self-hosted
    - docker-gpu
    - multi-gpu
  container:
    image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
    # `-v` maps the host Hugging Face cache directory onto /mnt/cache/ in the container.
    options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
  steps:
    - name: Launcher docker
      uses: actions/checkout@v2

    # A failing `nvidia-smi` does not fail the job here (`continue-on-error`).
    - name: NVIDIA-SMI
      continue-on-error: true
      run: |
        nvidia-smi

    # The last command upgrades torch/torchvision/torchaudio to the cu111 nightly
    # pre-release wheels after the project extras have been installed.
    - name: Install dependencies
      run: |
        apt -y update && apt install -y libsndfile1-dev git
        pip install --upgrade pip
        pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
        pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U

    # Prints what the freshly installed torch sees: CUDA availability, CUDA and
    # cuDNN versions, and the GPU count.
    - name: Are GPUs recognized by our DL frameworks
      run: |
        python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
        python -c "import torch; print('Cuda version:', torch.version.cuda)"
        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
        python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

    - name: Run all tests on GPU
      env:
        MKL_SERVICE_FORCE_INTEL: 1
      run: |
        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests

    # `always()` keeps this step running even when the test step above failed.
    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/tests_torch_multi_gpu_failures_short.txt

    # `-m is_pipeline_test` restricts the run to tests carrying that pytest marker.
    - name: Run all pipeline tests on GPU
      if: ${{ always() }}
      env:
        RUN_PIPELINE_TESTS: yes
      run: |
        python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt

    # Uploads the whole `reports` directory as one artifact, pass or fail.
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: run_all_tests_torch_multi_gpu_test_reports
        path: reports
|
||||||
|
|
||||||
|
run_all_tests_torch_cuda_extensions_gpu:
  # Single-GPU job for the CUDA-extension suites (tests/deepspeed and
  # tests/extended), run inside the NVIDIA NGC PyTorch image.
  runs-on:
    - self-hosted
    - docker-gpu
    - single-gpu
  container:
    image: nvcr.io/nvidia/pytorch:21.03-py3
    # `-v` maps the host Hugging Face cache directory onto /mnt/cache/ in the container.
    options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
  steps:
    - name: Launcher docker
      uses: actions/checkout@v2

    - name: NVIDIA-SMI
      run: |
        nvidia-smi

    # The last command upgrades torch/torchvision/torchaudio to the cu111 nightly
    # pre-release wheels after the project extras have been installed.
    - name: Install dependencies
      run: |
        apt -y update && apt install -y libaio-dev
        pip install --upgrade pip
        pip install .[testing,deepspeed]
        pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U

    # Prints what the freshly installed torch sees: CUDA availability, CUDA and
    # cuDNN versions, and the GPU count.
    - name: Are GPUs recognized by our DL frameworks
      run: |
        python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
        python -c "import torch; print('Cuda version:', torch.version.cuda)"
        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
        python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

    - name: Run all tests on GPU
      run: |
        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

    # `always()` keeps this step running even when the test step above failed.
    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt

    # Uploads the whole `reports` directory as one artifact, pass or fail.
    # Note the artifact name starts with `run_tests_`, not `run_all_tests_` like the job id.
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: run_tests_torch_cuda_extensions_gpu_test_reports
        path: reports
|
||||||
|
|
||||||
|
run_all_tests_torch_cuda_extensions_multi_gpu:
  # Multi-GPU job for the CUDA-extension suites (tests/deepspeed and
  # tests/extended), run inside the NVIDIA NGC PyTorch image.
  runs-on: [self-hosted, docker-gpu, multi-gpu]
  container:
    image: nvcr.io/nvidia/pytorch:21.03-py3
    # `--gpus all`, like the other multi-GPU job in this workflow: the job is
    # scheduled on a `multi-gpu` runner, so the container is given every GPU.
    # `-v` maps the host Hugging Face cache directory onto /mnt/cache/ in the container.
    options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
  steps:
    - name: Launcher docker
      uses: actions/checkout@v2

    # A failing `nvidia-smi` does not fail the job here (`continue-on-error`).
    - name: NVIDIA-SMI
      continue-on-error: true
      run: |
        nvidia-smi

    # The nightly torch wheels must be installed here, before any test runs, so
    # that the suite below actually exercises the nightly build (same order as
    # the single-GPU CUDA-extensions job).
    - name: Install dependencies
      run: |
        apt -y update && apt install -y libaio-dev
        pip install --upgrade pip
        pip install .[testing,deepspeed,fairscale]
        pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U

    # Prints what the freshly installed torch sees: CUDA availability, CUDA and
    # cuDNN versions, and the GPU count.
    - name: Are GPUs recognized by our DL frameworks
      run: |
        python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
        python -c "import torch; print('Cuda version:', torch.version.cuda)"
        python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
        python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

    - name: Run all tests on GPU
      run: |
        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended

    # `always()` keeps this step running even when the test step above failed.
    - name: Failure short reports
      if: ${{ always() }}
      run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt

    # Uploads the whole `reports` directory as one artifact, pass or fail.
    # Note the artifact name starts with `run_tests_`, not `run_all_tests_` like the job id.
    - name: Test suite reports artifacts
      if: ${{ always() }}
      uses: actions/upload-artifact@v2
      with:
        name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
        path: reports
|
||||||
|
|
||||||
|
send_results:
  # Reporting job: waits for all four test jobs, fetches their uploaded
  # artifacts and runs the Slack notification script.
  name: Send results to webhook
  runs-on: ubuntu-latest
  # Runs whatever the outcome of the jobs listed under `needs`.
  if: always()
  needs:
    - run_all_tests_torch_gpu
    - run_all_tests_torch_multi_gpu
    - run_all_tests_torch_cuda_extensions_gpu
    - run_all_tests_torch_cuda_extensions_multi_gpu
  steps:
    - uses: actions/checkout@v2

    # No `with.name` is given, so no single artifact is selected.
    - uses: actions/download-artifact@v2

    # The Slack token and channel ids are taken from repository secrets and
    # exposed to the script as environment variables.
    - name: Send message to Slack
      env:
        CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
        CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
        CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
        CI_SLACK_CHANNEL_ID_PAST_FUTURE: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}

      run: |
        pip install slack_sdk
        python utils/notification_service.py scheduled nightly-torch
|
||||||
@@ -38,13 +38,13 @@ def handle_test_results(test_results):
|
|||||||
return failed, success, time_spent
|
return failed, success, time_spent
|
||||||
|
|
||||||
|
|
||||||
def format_for_slack(total_results, results, scheduled: bool):
|
def format_for_slack(total_results, results, scheduled: bool, title: str):
|
||||||
print(total_results, results)
|
print(total_results, results)
|
||||||
header = {
|
header = {
|
||||||
"type": "header",
|
"type": "header",
|
||||||
"text": {
|
"text": {
|
||||||
"type": "plain_text",
|
"type": "plain_text",
|
||||||
"text": "🤗 Results of the scheduled tests." if scheduled else "🤗 Self-push results",
|
"text": title,
|
||||||
"emoji": True,
|
"emoji": True,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@@ -105,7 +105,13 @@ def format_for_slack(total_results, results, scheduled: bool):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
scheduled = sys.argv[1] == "scheduled"
|
arguments = sys.argv[1:]
|
||||||
|
|
||||||
|
if "scheduled" in arguments:
|
||||||
|
arguments.remove("scheduled")
|
||||||
|
scheduled = True
|
||||||
|
else:
|
||||||
|
scheduled = False
|
||||||
|
|
||||||
if scheduled:
|
if scheduled:
|
||||||
# The scheduled run has several artifacts for each job.
|
# The scheduled run has several artifacts for each job.
|
||||||
@@ -149,7 +155,21 @@ if __name__ == "__main__":
|
|||||||
}
|
}
|
||||||
|
|
||||||
client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
|
client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
|
||||||
channel_id = os.environ["CI_SLACK_CHANNEL_ID_DAILY"] if scheduled else os.environ["CI_SLACK_CHANNEL_ID"]
|
|
||||||
|
if not scheduled:
|
||||||
|
channel_id = os.environ["CI_SLACK_CHANNEL_ID"]
|
||||||
|
elif scheduled and len(arguments):
|
||||||
|
channel_id = os.environ["CI_SLACK_CHANNEL_ID_PAST_FUTURE"]
|
||||||
|
else:
|
||||||
|
channel_id = os.environ["CI_SLACK_CHANNEL_ID_DAILY"]
|
||||||
|
|
||||||
|
if scheduled:
|
||||||
|
title = "🤗 Results of the scheduled tests."
|
||||||
|
else:
|
||||||
|
title = "🤗 Self-push results"
|
||||||
|
|
||||||
|
if len(arguments):
|
||||||
|
title = f"{arguments} " + title
|
||||||
|
|
||||||
try:
|
try:
|
||||||
results = {}
|
results = {}
|
||||||
@@ -182,7 +202,7 @@ if __name__ == "__main__":
|
|||||||
total[result_key] += job_result[result_key]
|
total[result_key] += job_result[result_key]
|
||||||
|
|
||||||
if total["failed"] != 0 or scheduled:
|
if total["failed"] != 0 or scheduled:
|
||||||
to_be_sent_to_slack = format_for_slack(total, results, scheduled)
|
to_be_sent_to_slack = format_for_slack(total, results, scheduled, title)
|
||||||
|
|
||||||
result = client.chat_postMessage(
|
result = client.chat_postMessage(
|
||||||
channel=channel_id,
|
channel=channel_id,
|
||||||
|
|||||||
Reference in New Issue
Block a user