Show diff between 2 CI runs on Slack reports (#22798)
fix Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
11
.github/workflows/self-scheduled.yml
vendored
11
.github/workflows/self-scheduled.yml
vendored
@@ -487,12 +487,23 @@ jobs:
|
|||||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
|
||||||
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||||
CI_EVENT: scheduled
|
CI_EVENT: scheduled
|
||||||
|
CI_SHA: ${{ github.sha }}
|
||||||
|
CI_WORKFLOW_REF: ${{ github.workflow_ref }}
|
||||||
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
|
||||||
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
|
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
|
||||||
SETUP_STATUS: ${{ needs.setup.result }}
|
SETUP_STATUS: ${{ needs.setup.result }}
|
||||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||||
run: |
|
run: |
|
||||||
|
sudo apt-get install -y curl
|
||||||
pip install slack_sdk
|
pip install slack_sdk
|
||||||
pip show slack_sdk
|
pip show slack_sdk
|
||||||
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
|
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
|
||||||
|
|
||||||
|
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
|
||||||
|
- name: Failure table artifacts
|
||||||
|
if: ${{ always() }}
|
||||||
|
uses: actions/upload-artifact@v3
|
||||||
|
with:
|
||||||
|
name: test_failure_tables
|
||||||
|
path: test_failure_tables
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ import argparse
|
|||||||
import json
|
import json
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
import subprocess
|
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
import zipfile
|
import zipfile
|
||||||
@@ -70,19 +69,16 @@ def download_artifact(artifact_name, artifact_url, output_dir, token):
|
|||||||
but it can't be used to download directly. We need to get a redirect URL first.
|
but it can't be used to download directly. We need to get a redirect URL first.
|
||||||
See https://docs.github.com/en/rest/actions/artifacts#download-an-artifact
|
See https://docs.github.com/en/rest/actions/artifacts#download-an-artifact
|
||||||
"""
|
"""
|
||||||
# Get the redirect URL first
|
headers = None
|
||||||
cmd = f'curl -v -H "Accept: application/vnd.github+json" -H "Authorization: Bearer {token}" {artifact_url}'
|
if token is not None:
|
||||||
output = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}
|
||||||
o = output.stdout.decode("utf-8")
|
|
||||||
lines = o.splitlines()
|
|
||||||
|
|
||||||
for line in lines:
|
result = requests.get(artifact_url, headers=headers, allow_redirects=False)
|
||||||
if line.startswith("< Location: "):
|
download_url = result.headers["Location"]
|
||||||
redirect_url = line[len("< Location: ") :]
|
response = requests.get(download_url, allow_redirects=True)
|
||||||
r = requests.get(redirect_url, allow_redirects=True)
|
file_path = os.path.join(output_dir, f"{artifact_name}.zip")
|
||||||
p = os.path.join(output_dir, f"{artifact_name}.zip")
|
with open(file_path, "wb") as fp:
|
||||||
open(p, "wb").write(r.content)
|
fp.write(response.content)
|
||||||
break
|
|
||||||
|
|
||||||
|
|
||||||
def get_errors_from_single_artifact(artifact_zip_path, job_links=None):
|
def get_errors_from_single_artifact(artifact_zip_path, job_links=None):
|
||||||
|
|||||||
70
utils/get_previous_daily_ci.py
Normal file
70
utils/get_previous_daily_ci.py
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
import os
|
||||||
|
import zipfile
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from get_ci_error_statistics import download_artifact, get_artifacts_links
|
||||||
|
|
||||||
|
|
||||||
|
def get_daily_ci_runs(token, num_runs=7):
    """Get the workflow runs of the scheduled (daily) CI.

    This only selects the runs triggered by the `schedule` event on the `main` branch.

    Args:
        token: GitHub access token sent as a bearer token. When `None`, the request is sent without
            authentication headers.
        num_runs: Maximum number of workflow runs requested from the API (passed as `per_page`).

    Returns:
        The value stored under the `workflow_runs` key of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the API does not answer in time.
    """
    headers = None
    if token is not None:
        headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}

    # The id of a workflow (not of a workflow run)
    workflow_id = "636036"

    url = f"https://api.github.com/repos/huggingface/transformers/actions/workflows/{workflow_id}/runs"
    # On `main` branch + event being `schedule` + not returning PRs + only `num_runs` results
    url += f"?branch=main&event=schedule&exclude_pull_requests=true&per_page={num_runs}"

    # Without a timeout `requests` may wait forever on an unresponsive server and stall the CI job.
    response = requests.get(url, headers=headers, timeout=30)
    # Surface API errors (bad token, rate limiting, ...) as an HTTP error rather than as a `KeyError` below.
    response.raise_for_status()

    return response.json()["workflow_runs"]
|
||||||
|
|
||||||
|
|
||||||
|
def get_last_daily_ci_runs(token):
    """Return the id of the first run reported as completed among the scheduled (daily) CI runs.

    The runs are examined in the order returned by `get_daily_ci_runs` (presumably most recent first; confirm
    against the GitHub API ordering). `None` is returned when no run has the status `completed`.
    """
    completed_run_ids = (run["id"] for run in get_daily_ci_runs(token) if run["status"] == "completed")
    # The generator is lazy, so the scan stops at the first completed run.
    return next(completed_run_ids, None)
|
||||||
|
|
||||||
|
|
||||||
|
def get_last_daily_ci_artifacts(artifact_names, output_dir, token):
    """Download the requested artifacts of the last completed scheduled (daily) CI run.

    Nothing is downloaded when no completed run is found. Names in `artifact_names` that the run did not
    produce are silently skipped.
    """
    last_run_id = get_last_daily_ci_runs(token)
    if last_run_id is None:
        return

    # NOTE(review): the keyword is spelled `worflow_run_id` here - presumably it mirrors the parameter name of
    # `get_artifacts_links`; confirm before "fixing" the spelling.
    available_links = get_artifacts_links(worflow_run_id=last_run_id, token=token)

    for name in artifact_names:
        if name not in available_links:
            continue
        download_artifact(
            artifact_name=name, artifact_url=available_links[name], output_dir=output_dir, token=token
        )
|
||||||
|
|
||||||
|
|
||||||
|
def get_last_daily_ci_reports(artifact_names, output_dir, token):
    """Get the artifacts' content of the last completed workflow run id of the scheduled (daily) CI.

    The artifacts are first downloaded to `output_dir` as `{artifact_name}.zip`, then every file inside each
    archive is read and decoded as UTF-8.

    Args:
        artifact_names: Names of the artifacts to fetch and read.
        output_dir: Directory where the zip archives are stored and looked up.
        token: GitHub access token forwarded to the download helpers.

    Returns:
        A dict mapping each requested artifact name to a dict `{file name in the archive: file content}`.
        Every requested name is present; it maps to an empty dict when no archive with that name exists in
        `output_dir`.
    """
    get_last_daily_ci_artifacts(artifact_names, output_dir, token)

    results = {}
    for artifact_name in artifact_names:
        results[artifact_name] = {}

        artifact_zip_path = os.path.join(output_dir, f"{artifact_name}.zip")
        if not os.path.isfile(artifact_zip_path):
            continue

        with zipfile.ZipFile(artifact_zip_path) as z:
            for member in z.infolist():
                # Ask the archive itself whether the entry is a directory: checking the member name against
                # the local filesystem says nothing about the content of the zip file.
                if member.is_dir():
                    continue
                # read the file
                with z.open(member) as f:
                    results[artifact_name][member.filename] = f.read().decode("UTF-8")

    return results
|
||||||
@@ -25,6 +25,7 @@ from typing import Dict, List, Optional, Union
|
|||||||
|
|
||||||
import requests
|
import requests
|
||||||
from get_ci_error_statistics import get_job_links
|
from get_ci_error_statistics import get_job_links
|
||||||
|
from get_previous_daily_ci import get_last_daily_ci_reports
|
||||||
from slack_sdk import WebClient
|
from slack_sdk import WebClient
|
||||||
|
|
||||||
|
|
||||||
@@ -274,6 +275,43 @@ class Message:
|
|||||||
|
|
||||||
return {"type": "section", "text": {"type": "mrkdwn", "text": category_failures_report}}
|
return {"type": "section", "text": {"type": "mrkdwn", "text": category_failures_report}}
|
||||||
|
|
||||||
|
def compute_diff_for_failure_reports(self, curr_failure_report, prev_failure_report):  # noqa
    """Return the table rows whose failure counts increased compared to the previous report.

    Both reports are text tables. After splitting on newlines, the first 3 and the last 2 elements are
    dropped (presumably title, code fence and header at the top, and closing fence plus trailing empty string
    at the bottom, as produced by `prepare_reports` - confirm). Each remaining row is expected to look like
    `<count> | <count> | ... | <category name>`.

    Args:
        curr_failure_report: Failure table of the current run.
        prev_failure_report: Failure table of the previous run.

    Returns:
        A list of formatted rows sorted by category name. A category that is absent from the previous report
        is listed with its current counts. A category present in both is listed with the per-column
        differences, but only when at least one column increased. Non-zero values carry an explicit sign and
        every value is right-justified to 9 characters.
    """

    def parse_rows(rows):
        """Map each category name to its integer counts, ignoring lines that are not table rows."""
        table = {}
        for row in rows:
            *counts, name = [cell.strip() for cell in row.split("| ")]
            if not name or not counts:
                continue
            try:
                table[name] = [int(count) for count in counts]
            except ValueError:
                # Not a row of counts (e.g. a header or a marker line): skip it instead of aborting the report.
                continue
        return table

    # Remove the leading and trailing parts that don't contain failure count information.
    curr_rows = curr_failure_report.split("\n")[3:-2]
    prev_rows = prev_failure_report.split("\n")[3:-2]

    prev_map = parse_rows(prev_rows)
    # Only rows whose text differs from every previous row can carry a change.
    unchanged_rows = set(prev_rows)
    curr_map = parse_rows(row for row in curr_rows if row not in unchanged_rows)

    diff_map = {}
    for name, counts in curr_map.items():
        if name not in prev_map:
            diff_map[name] = counts
            continue
        diff = [new - old for new, old in zip(counts, prev_map[name])]
        # Rows where counts only decreased (or stayed equal) are not reported.
        if any(delta > 0 for delta in diff):
            diff_map[name] = diff

    entries_changed = []
    for name in sorted(diff_map):
        cells = [("0" if value == 0 else f"{value:+d}").rjust(9) for value in diff_map[name]]
        entries_changed.append(" | ".join(cells) + " | " + name)

    return entries_changed
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def model_failures(self) -> Dict:
|
def model_failures(self) -> Dict:
|
||||||
# Obtain per-model failures
|
# Obtain per-model failures
|
||||||
@@ -331,44 +369,86 @@ class Message:
|
|||||||
|
|
||||||
model_reports.append(report)
|
model_reports.append(report)
|
||||||
|
|
||||||
|
# (Possibly truncated) reports for the current workflow run - to be sent to Slack channels
|
||||||
model_header = "Single PT | Multi PT | Single TF | Multi TF | Other | Category\n"
|
model_header = "Single PT | Multi PT | Single TF | Multi TF | Other | Category\n"
|
||||||
sorted_model_reports = sorted(model_reports, key=lambda s: s.split("] ")[-1])
|
sorted_model_reports = sorted(model_reports, key=lambda s: s.split("| ")[-1])
|
||||||
model_failures_report = prepare_reports(
|
model_failures_report = prepare_reports(
|
||||||
title="These following model modules had failures", header=model_header, reports=sorted_model_reports
|
title="These following model modules had failures", header=model_header, reports=sorted_model_reports
|
||||||
)
|
)
|
||||||
|
|
||||||
module_header = "Single | Multi | Category\n"
|
module_header = "Single | Multi | Category\n"
|
||||||
sorted_module_reports = sorted(other_module_reports, key=lambda s: s.split("] ")[-1])
|
sorted_module_reports = sorted(other_module_reports, key=lambda s: s.split("| ")[-1])
|
||||||
module_failures_report = prepare_reports(
|
module_failures_report = prepare_reports(
|
||||||
title="The following non-model modules had failures", header=module_header, reports=sorted_module_reports
|
title="The following non-model modules had failures", header=module_header, reports=sorted_module_reports
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# To be sent to Slack channels
|
||||||
model_failure_sections = [
|
model_failure_sections = [
|
||||||
{"type": "section", "text": {"type": "mrkdwn", "text": model_failures_report}},
|
{"type": "section", "text": {"type": "mrkdwn", "text": model_failures_report}},
|
||||||
{"type": "section", "text": {"type": "mrkdwn", "text": module_failures_report}},
|
{"type": "section", "text": {"type": "mrkdwn", "text": module_failures_report}},
|
||||||
]
|
]
|
||||||
|
|
||||||
# Save complete tables (for past CI) - to be uploaded as artifacts
|
# Save the complete (i.e. no truncation) failure tables (of the current workflow run)
|
||||||
if ci_event.startswith("Past CI"):
|
# (to be uploaded as artifacts)
|
||||||
model_failures_report = prepare_reports(
|
if not os.path.isdir(os.path.join(os.getcwd(), "test_failure_tables")):
|
||||||
title="These following model modules had failures",
|
os.makedirs(os.path.join(os.getcwd(), "test_failure_tables"))
|
||||||
header=model_header,
|
|
||||||
reports=sorted_model_reports,
|
|
||||||
to_truncate=False,
|
|
||||||
)
|
|
||||||
file_path = os.path.join(os.getcwd(), "test_failure_tables/model_failures_report.txt")
|
|
||||||
with open(file_path, "w", encoding="UTF-8") as fp:
|
|
||||||
fp.write(model_failures_report)
|
|
||||||
|
|
||||||
module_failures_report = prepare_reports(
|
model_failures_report = prepare_reports(
|
||||||
title="The following non-model modules had failures",
|
title="These following model modules had failures",
|
||||||
header=module_header,
|
header=model_header,
|
||||||
reports=sorted_module_reports,
|
reports=sorted_model_reports,
|
||||||
to_truncate=False,
|
to_truncate=False,
|
||||||
|
)
|
||||||
|
file_path = os.path.join(os.getcwd(), "test_failure_tables/model_failures_report.txt")
|
||||||
|
with open(file_path, "w", encoding="UTF-8") as fp:
|
||||||
|
fp.write(model_failures_report)
|
||||||
|
|
||||||
|
module_failures_report = prepare_reports(
|
||||||
|
title="The following non-model modules had failures",
|
||||||
|
header=module_header,
|
||||||
|
reports=sorted_module_reports,
|
||||||
|
to_truncate=False,
|
||||||
|
)
|
||||||
|
file_path = os.path.join(os.getcwd(), "test_failure_tables/module_failures_report.txt")
|
||||||
|
with open(file_path, "w", encoding="UTF-8") as fp:
|
||||||
|
fp.write(module_failures_report)
|
||||||
|
|
||||||
|
target_workflow = "huggingface/transformers/.github/workflows/self-scheduled.yml@refs/heads/main"
|
||||||
|
if os.environ.get("CI_WORKFLOW_REF") == target_workflow:
|
||||||
|
# Get the last previously completed CI's failure tables
|
||||||
|
artifact_names = ["test_failure_tables"]
|
||||||
|
output_dir = os.path.join(os.getcwd(), "previous_reports")
|
||||||
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
prev_tables = get_last_daily_ci_reports(
|
||||||
|
artifact_names=artifact_names, output_dir=output_dir, token=os.environ["ACCESS_REPO_INFO_TOKEN"]
|
||||||
)
|
)
|
||||||
file_path = os.path.join(os.getcwd(), "test_failure_tables/module_failures_report.txt")
|
|
||||||
with open(file_path, "w", encoding="UTF-8") as fp:
|
# The last run may not have produced `test_failure_tables` (because of some issue, or because there were no model failures at all)
|
||||||
fp.write(module_failures_report)
|
if len(prev_tables) > 0:
|
||||||
|
# Compute the difference of the previous/current (model failure) table
|
||||||
|
prev_model_failures = prev_tables["test_failure_tables"]["model_failures_report.txt"]
|
||||||
|
entries_changed = self.compute_diff_for_failure_reports(model_failures_report, prev_model_failures)
|
||||||
|
if len(entries_changed) > 0:
|
||||||
|
# Save the complete difference
|
||||||
|
diff_report = prepare_reports(
|
||||||
|
title="Changed model modules failures",
|
||||||
|
header=model_header,
|
||||||
|
reports=entries_changed,
|
||||||
|
to_truncate=False,
|
||||||
|
)
|
||||||
|
file_path = os.path.join(os.getcwd(), "test_failure_tables/changed_model_failures_report.txt")
|
||||||
|
with open(file_path, "w", encoding="UTF-8") as fp:
|
||||||
|
fp.write(diff_report)
|
||||||
|
|
||||||
|
# To be sent to Slack channels
|
||||||
|
diff_report = prepare_reports(
|
||||||
|
title="*Changed model modules failures*",
|
||||||
|
header=model_header,
|
||||||
|
reports=entries_changed,
|
||||||
|
)
|
||||||
|
model_failure_sections.append(
|
||||||
|
{"type": "section", "text": {"type": "mrkdwn", "text": diff_report}},
|
||||||
|
)
|
||||||
|
|
||||||
return model_failure_sections
|
return model_failure_sections
|
||||||
|
|
||||||
@@ -487,14 +567,15 @@ class Message:
|
|||||||
)
|
)
|
||||||
|
|
||||||
def post(self):
|
def post(self):
|
||||||
|
payload = self.payload
|
||||||
print("Sending the following payload")
|
print("Sending the following payload")
|
||||||
print(json.dumps({"blocks": json.loads(self.payload)}))
|
print(json.dumps({"blocks": json.loads(payload)}))
|
||||||
|
|
||||||
text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed."
|
text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed."
|
||||||
|
|
||||||
self.thread_ts = client.chat_postMessage(
|
self.thread_ts = client.chat_postMessage(
|
||||||
channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"],
|
channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"],
|
||||||
blocks=self.payload,
|
blocks=payload,
|
||||||
text=text,
|
text=text,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -748,6 +829,9 @@ if __name__ == "__main__":
|
|||||||
else:
|
else:
|
||||||
ci_title = f"<{ci_url}|{ci_title}>\nAuthor: {ci_author} | Merged by: {merged_by}"
|
ci_title = f"<{ci_url}|{ci_title}>\nAuthor: {ci_author} | Merged by: {merged_by}"
|
||||||
|
|
||||||
|
elif ci_sha:
|
||||||
|
ci_title = f"<{ci_url}|commit: {ci_sha}>"
|
||||||
|
|
||||||
else:
|
else:
|
||||||
ci_title = ""
|
ci_title = ""
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user