From 648bd5a8aad926d611d900d3d202dce37e085359 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 19 Apr 2023 19:27:37 +0200 Subject: [PATCH] Show diff between 2 CI runs on Slack reports (#22798) fix Co-authored-by: ydshieh --- .github/workflows/self-scheduled.yml | 11 +++ utils/get_ci_error_statistics.py | 22 ++--- utils/get_previous_daily_ci.py | 70 +++++++++++++++ utils/notification_service.py | 130 ++++++++++++++++++++++----- 4 files changed, 197 insertions(+), 36 deletions(-) create mode 100644 utils/get_previous_daily_ci.py diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index cdbde587a5..f244b4af91 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -487,12 +487,23 @@ jobs: CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} CI_EVENT: scheduled + CI_SHA: ${{ github.sha }} + CI_WORKFLOW_REF: ${{ github.workflow_ref }} RUNNER_STATUS: ${{ needs.check_runner_status.result }} RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} SETUP_STATUS: ${{ needs.setup.result }} # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. run: | + sudo apt-get install -y curl pip install slack_sdk pip show slack_sdk python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" + + # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. + - name: Failure table artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: test_failure_tables + path: test_failure_tables diff --git a/utils/get_ci_error_statistics.py b/utils/get_ci_error_statistics.py index e9dc52b5bb..93884dda1d 100644 --- a/utils/get_ci_error_statistics.py +++ b/utils/get_ci_error_statistics.py @@ -2,7 +2,6 @@ import argparse import json import math import os -import subprocess import time import traceback import zipfile @@ -70,19 +69,16 @@ def download_artifact(artifact_name, artifact_url, output_dir, token): but it can't be used to download directly. We need to get a redirect URL first. See https://docs.github.com/en/rest/actions/artifacts#download-an-artifact """ - # Get the redirect URL first - cmd = f'curl -v -H "Accept: application/vnd.github+json" -H "Authorization: Bearer {token}" {artifact_url}' - output = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - o = output.stdout.decode("utf-8") - lines = o.splitlines() + headers = None + if token is not None: + headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"} - for line in lines: - if line.startswith("< Location: "): - redirect_url = line[len("< Location: ") :] - r = requests.get(redirect_url, allow_redirects=True) - p = os.path.join(output_dir, f"{artifact_name}.zip") - open(p, "wb").write(r.content) - break + result = requests.get(artifact_url, headers=headers, allow_redirects=False) + download_url = result.headers["Location"] + response = requests.get(download_url, allow_redirects=True) + file_path = os.path.join(output_dir, f"{artifact_name}.zip") + with open(file_path, "wb") as fp: + fp.write(response.content) def get_errors_from_single_artifact(artifact_zip_path, job_links=None): diff --git a/utils/get_previous_daily_ci.py b/utils/get_previous_daily_ci.py new file mode 100644 index 0000000000..0bcb7fc890 --- /dev/null +++ b/utils/get_previous_daily_ci.py @@ -0,0 +1,70 @@ +import os +import zipfile + +import requests +from get_ci_error_statistics import download_artifact, get_artifacts_links + + +def get_daily_ci_runs(token, num_runs=7): + """Get the workflow runs of the scheduled (daily) CI. + + This only selects the runs triggered by the `schedule` event on the `main` branch. + """ + headers = None + if token is not None: + headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"} + + # The id of a workflow (not of a workflow run) + workflow_id = "636036" + + url = f"https://api.github.com/repos/huggingface/transformers/actions/workflows/{workflow_id}/runs" + # On `main` branch + event being `schedule` + not returning PRs + only `num_runs` results + url += f"?branch=main&event=schedule&exclude_pull_requests=true&per_page={num_runs}" + + result = requests.get(url, headers=headers).json() + + return result["workflow_runs"] + + +def get_last_daily_ci_runs(token): + """Get the last completed workflow run id of the scheduled (daily) CI.""" + workflow_runs = get_daily_ci_runs(token) + workflow_run_id = None + for workflow_run in workflow_runs: + if workflow_run["status"] == "completed": + workflow_run_id = workflow_run["id"] + break + + return workflow_run_id + + +def get_last_daily_ci_artifacts(artifact_names, output_dir, token): + """Get the artifacts of last completed workflow run id of the scheduled (daily) CI.""" + workflow_run_id = get_last_daily_ci_runs(token) + if workflow_run_id is not None: + artifacts_links = get_artifacts_links(worflow_run_id=workflow_run_id, token=token) + for artifact_name in artifact_names: + if artifact_name in artifacts_links: + artifact_url = artifacts_links[artifact_name] + download_artifact( + artifact_name=artifact_name, artifact_url=artifact_url, output_dir=output_dir, token=token + ) + + +def get_last_daily_ci_reports(artifact_names, output_dir, token): + """Get the artifacts' content of the last completed workflow run id of the scheduled (daily) CI.""" + get_last_daily_ci_artifacts(artifact_names, output_dir, token) + + results = {} + for artifact_name in artifact_names: + results[artifact_name] = {} + artifact_zip_path = os.path.join(output_dir, f"{artifact_name}.zip") + if os.path.isfile(artifact_zip_path): + with zipfile.ZipFile(artifact_zip_path) as z: + for filename in z.namelist(): + if not os.path.isdir(filename): + # read the file + with z.open(filename) as f: + results[artifact_name][filename] = f.read().decode("UTF-8") + + return results diff --git a/utils/notification_service.py b/utils/notification_service.py index 7251b4d400..bbdd4e996d 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -25,6 +25,7 @@ from typing import Dict, List, Optional, Union import requests from get_ci_error_statistics import get_job_links +from get_previous_daily_ci import get_last_daily_ci_reports from slack_sdk import WebClient @@ -274,6 +275,43 @@ class Message: return {"type": "section", "text": {"type": "mrkdwn", "text": category_failures_report}} + def compute_diff_for_failure_reports(self, curr_failure_report, prev_failure_report): # noqa + # Remove the leading and training parts that don't contain failure count information. + model_failures = curr_failure_report.split("\n")[3:-2] + prev_model_failures = prev_failure_report.split("\n")[3:-2] + entries_changed = set(model_failures).difference(prev_model_failures) + + prev_map = {} + for f in prev_model_failures: + items = [x.strip() for x in f.split("| ")] + prev_map[items[-1]] = [int(x) for x in items[:-1]] + + curr_map = {} + for f in entries_changed: + items = [x.strip() for x in f.split("| ")] + curr_map[items[-1]] = [int(x) for x in items[:-1]] + + diff_map = {} + for k, v in curr_map.items(): + if k not in prev_map: + diff_map[k] = v + else: + diff = [x - y for x, y in zip(v, prev_map[k])] + if max(diff) > 0: + diff_map[k] = diff + + entries_changed = [] + for model_name, diff_values in diff_map.items(): + diff = [str(x) for x in diff_values] + diff = [f"+{x}" if (x != "0" and not x.startswith("-")) else x for x in diff] + diff = [x.rjust(9) for x in diff] + device_report = " | ".join(diff) + " | " + report = f"{device_report}{model_name}" + entries_changed.append(report) + entries_changed = sorted(entries_changed, key=lambda s: s.split("| ")[-1]) + + return entries_changed + @property def model_failures(self) -> Dict: # Obtain per-model failures @@ -331,44 +369,86 @@ class Message: model_reports.append(report) + # (Possibly truncated) reports for the current workflow run - to be sent to Slack channels model_header = "Single PT | Multi PT | Single TF | Multi TF | Other | Category\n" - sorted_model_reports = sorted(model_reports, key=lambda s: s.split("] ")[-1]) + sorted_model_reports = sorted(model_reports, key=lambda s: s.split("| ")[-1]) model_failures_report = prepare_reports( title="These following model modules had failures", header=model_header, reports=sorted_model_reports ) module_header = "Single | Multi | Category\n" - sorted_module_reports = sorted(other_module_reports, key=lambda s: s.split("] ")[-1]) + sorted_module_reports = sorted(other_module_reports, key=lambda s: s.split("| ")[-1]) module_failures_report = prepare_reports( title="The following non-model modules had failures", header=module_header, reports=sorted_module_reports ) + # To be sent to Slack channels model_failure_sections = [ {"type": "section", "text": {"type": "mrkdwn", "text": model_failures_report}}, {"type": "section", "text": {"type": "mrkdwn", "text": module_failures_report}}, ] - # Save complete tables (for past CI) - to be uploaded as artifacts - if ci_event.startswith("Past CI"): - model_failures_report = prepare_reports( - title="These following model modules had failures", - header=model_header, - reports=sorted_model_reports, - to_truncate=False, - ) - file_path = os.path.join(os.getcwd(), "test_failure_tables/model_failures_report.txt") - with open(file_path, "w", encoding="UTF-8") as fp: - fp.write(model_failures_report) + # Save the complete (i.e. no truncation) failure tables (of the current workflow run) + # (to be uploaded as artifacts) + if not os.path.isdir(os.path.join(os.getcwd(), "test_failure_tables")): + os.makedirs(os.path.join(os.getcwd(), "test_failure_tables")) - module_failures_report = prepare_reports( - title="The following non-model modules had failures", - header=module_header, - reports=sorted_module_reports, - to_truncate=False, + model_failures_report = prepare_reports( + title="These following model modules had failures", + header=model_header, + reports=sorted_model_reports, + to_truncate=False, + ) + file_path = os.path.join(os.getcwd(), "test_failure_tables/model_failures_report.txt") + with open(file_path, "w", encoding="UTF-8") as fp: + fp.write(model_failures_report) + + module_failures_report = prepare_reports( + title="The following non-model modules had failures", + header=module_header, + reports=sorted_module_reports, + to_truncate=False, + ) + file_path = os.path.join(os.getcwd(), "test_failure_tables/module_failures_report.txt") + with open(file_path, "w", encoding="UTF-8") as fp: + fp.write(module_failures_report) + + target_workflow = "huggingface/transformers/.github/workflows/self-scheduled.yml@refs/heads/main" + if os.environ.get("CI_WORKFLOW_REF") == target_workflow: + # Get the last previously completed CI's failure tables + artifact_names = ["test_failure_tables"] + output_dir = os.path.join(os.getcwd(), "previous_reports") + os.makedirs(output_dir, exist_ok=True) + prev_tables = get_last_daily_ci_reports( + artifact_names=artifact_names, output_dir=output_dir, token=os.environ["ACCESS_REPO_INFO_TOKEN"] ) - file_path = os.path.join(os.getcwd(), "test_failure_tables/module_failures_report.txt") - with open(file_path, "w", encoding="UTF-8") as fp: - fp.write(module_failures_report) + + # The last run doesn't produce `test_failure_tables` (by some issues or have no model failure at all) + if len(prev_tables) > 0: + # Compute the difference of the previous/current (model failure) table + prev_model_failures = prev_tables["test_failure_tables"]["model_failures_report.txt"] + entries_changed = self.compute_diff_for_failure_reports(model_failures_report, prev_model_failures) + if len(entries_changed) > 0: + # Save the complete difference + diff_report = prepare_reports( + title="Changed model modules failures", + header=model_header, + reports=entries_changed, + to_truncate=False, + ) + file_path = os.path.join(os.getcwd(), "test_failure_tables/changed_model_failures_report.txt") + with open(file_path, "w", encoding="UTF-8") as fp: + fp.write(diff_report) + + # To be sent to Slack channels + diff_report = prepare_reports( + title="*Changed model modules failures*", + header=model_header, + reports=entries_changed, + ) + model_failure_sections.append( + {"type": "section", "text": {"type": "mrkdwn", "text": diff_report}}, + ) return model_failure_sections @@ -487,14 +567,15 @@ class Message: ) def post(self): + payload = self.payload print("Sending the following payload") - print(json.dumps({"blocks": json.loads(self.payload)})) + print(json.dumps({"blocks": json.loads(payload)})) text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed." self.thread_ts = client.chat_postMessage( channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"], - blocks=self.payload, + blocks=payload, text=text, ) @@ -748,6 +829,9 @@ if __name__ == "__main__": else: ci_title = f"<{ci_url}|{ci_title}>\nAuthor: {ci_author} | Merged by: {merged_by}" + elif ci_sha: + ci_title = f"<{ci_url}|commit: {ci_sha}>" + else: ci_title = ""