Show diff between 2 CI runs on Slack reports (#22798)

fix Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2023-04-19 19:27:37 +02:00
parent 5f97bbc124
commit 648bd5a8aa
4 changed files with 197 additions and 36 deletions
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -487,12 +487,23 @@ jobs:
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          CI_EVENT: scheduled
          CI_SHA: ${{ github.sha }}
          CI_WORKFLOW_REF: ${{ github.workflow_ref }}
          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}
        # We pass `needs.setup.outputs.matrix` as the argument. `notification_service.py` has to convert
        # `models/bert` to `models_bert`, as the artifact names use `_` instead of `/`.
        run: |
          sudo apt-get install -y curl
          pip install slack_sdk
          pip show slack_sdk
          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
      - name: Failure table artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v3
        with:
          name: test_failure_tables
          path: test_failure_tables
--- a/utils/get_ci_error_statistics.py
+++ b/utils/get_ci_error_statistics.py
@@ -2,7 +2,6 @@ import argparse
 import json
 import math
 import os
 import subprocess
 import time
 import traceback
 import zipfile
@@ -70,19 +69,16 @@ def download_artifact(artifact_name, artifact_url, output_dir, token):
    but it can't be used to download directly. We need to get a redirect URL first.
    See https://docs.github.com/en/rest/actions/artifacts#download-an-artifact
    """
-    # Get the redirect URL first
+    headers = None
-    cmd = f'curl -v -H "Accept: application/vnd.github+json" -H "Authorization: Bearer {token}" {artifact_url}'
+    if token is not None:
-    output = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+        headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}
    o = output.stdout.decode("utf-8")
    lines = o.splitlines()
-    for line in lines:
+    result = requests.get(artifact_url, headers=headers, allow_redirects=False)
-        if line.startswith("< Location: "):
+    download_url = result.headers["Location"]
-            redirect_url = line[len("< Location: ") :]
+    response = requests.get(download_url, allow_redirects=True)
-            r = requests.get(redirect_url, allow_redirects=True)
+    file_path = os.path.join(output_dir, f"{artifact_name}.zip")
-            p = os.path.join(output_dir, f"{artifact_name}.zip")
+    with open(file_path, "wb") as fp:
-            open(p, "wb").write(r.content)
+        fp.write(response.content)
            break
 def get_errors_from_single_artifact(artifact_zip_path, job_links=None):
--- a/utils/get_previous_daily_ci.py
+++ b/utils/get_previous_daily_ci.py
@@ -0,0 +1,70 @@
 import os
 import zipfile
 import requests
 from get_ci_error_statistics import download_artifact, get_artifacts_links
 def get_daily_ci_runs(token, num_runs=7):
    """Fetch the most recent workflow runs of the scheduled (daily) CI.

    The request is restricted to runs on the `main` branch that were triggered by the `schedule` event.

    Args:
        token: Token sent as a bearer token. When `None`, the request is sent without extra headers.
        num_runs: Number of runs to request (passed as `per_page`).

    Returns:
        The value stored under `workflow_runs` in the JSON response.
    """
    # Only send authentication headers when a token is available.
    headers = (
        {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"} if token is not None else None
    )

    # The id of a workflow (not of a workflow run)
    workflow_id = "636036"

    base_url = f"https://api.github.com/repos/huggingface/transformers/actions/workflows/{workflow_id}/runs"
    # On `main` branch + event being `schedule` + not returning PRs + only `num_runs` results
    query = f"?branch=main&event=schedule&exclude_pull_requests=true&per_page={num_runs}"

    payload = requests.get(base_url + query, headers=headers).json()
    return payload["workflow_runs"]
 def get_last_daily_ci_runs(token):
    """Return the id of the first run reported as `completed` among the daily CI runs.

    The runs are inspected in the order returned by `get_daily_ci_runs`. `None` is returned when none of
    them has the status `completed`.
    """
    completed_run_ids = (run["id"] for run in get_daily_ci_runs(token) if run["status"] == "completed")
    return next(completed_run_ids, None)
 def get_last_daily_ci_artifacts(artifact_names, output_dir, token):
    """Download the requested artifacts of the last completed daily CI run into `output_dir`.

    Nothing is downloaded when no completed run is found. Names in `artifact_names` that are not among the
    artifact links of that run are silently ignored.
    """
    run_id = get_last_daily_ci_runs(token)
    if run_id is None:
        return

    # NOTE(review): the keyword is spelled `worflow_run_id` - presumably this matches the parameter name of
    # `get_artifacts_links`; confirm before "fixing" the spelling.
    links = get_artifacts_links(worflow_run_id=run_id, token=token)
    for name in artifact_names:
        if name not in links:
            continue
        download_artifact(artifact_name=name, artifact_url=links[name], output_dir=output_dir, token=token)
 def get_last_daily_ci_reports(artifact_names, output_dir, token):
    """Get the artifacts' content of the last completed workflow run id of the scheduled (daily) CI.

    The artifacts are first downloaded into `output_dir` (as `{artifact_name}.zip`), then every file found in
    each archive is read and decoded as UTF-8.

    Args:
        artifact_names: Names of the artifacts to fetch and read.
        output_dir: Directory in which the downloaded zip files are looked up.
        token: Token forwarded to the download helper.

    Returns:
        A dict mapping each artifact name to a dict `{file name inside the archive: file content}`. An artifact
        whose zip file is not present in `output_dir` maps to an empty dict.
    """
    get_last_daily_ci_artifacts(artifact_names, output_dir, token)

    results = {}
    for artifact_name in artifact_names:
        results[artifact_name] = {}

        artifact_zip_path = os.path.join(output_dir, f"{artifact_name}.zip")
        if not os.path.isfile(artifact_zip_path):
            continue

        with zipfile.ZipFile(artifact_zip_path) as z:
            for member in z.infolist():
                # Decide "directory or file" from the archive entry itself. Checking the member name against
                # the local filesystem (`os.path.isdir`) says nothing about the content of the archive.
                if member.is_dir():
                    continue
                # read the file
                with z.open(member) as f:
                    results[artifact_name][member.filename] = f.read().decode("UTF-8")

    return results
--- a/utils/notification_service.py
+++ b/utils/notification_service.py
@@ -25,6 +25,7 @@ from typing import Dict, List, Optional, Union
 import requests
 from get_ci_error_statistics import get_job_links
 from get_previous_daily_ci import get_last_daily_ci_reports
 from slack_sdk import WebClient
@@ -274,6 +275,43 @@ class Message:
        return {"type": "section", "text": {"type": "mrkdwn", "text": category_failures_report}}
    def compute_diff_for_failure_reports(self, curr_failure_report, prev_failure_report):  # noqa
        """Build the table rows describing failure counts that increased since the previous report.

        Both reports are split into lines; the first 3 and the last 2 lines are dropped and every remaining
        line is parsed as `<count> | <count> | ... | <name>`.

        A row of the current report is kept when its name is absent from the previous report (the current
        counts are shown), or when at least one of its counts is larger than before (the per-column
        differences are shown). Positive values are prefixed with `+`.

        Returns:
            The formatted rows, sorted by name.
        """

        def parse_row(row):
            # The last cell is the name, all the cells before it are integer counts.
            *count_cells, name = (cell.strip() for cell in row.split("| "))
            return name, [int(cell) for cell in count_cells]

        # Remove the leading and trailing parts that don't contain failure count information.
        current_rows = curr_failure_report.split("\n")[3:-2]
        previous_rows = prev_failure_report.split("\n")[3:-2]

        previous_counts = dict(parse_row(row) for row in previous_rows)
        # Only the rows that don't appear verbatim in the previous report can carry a change.
        current_counts = dict(parse_row(row) for row in set(current_rows).difference(previous_rows))

        changes = {}
        for name, counts in current_counts.items():
            if name not in previous_counts:
                changes[name] = counts
                continue
            delta = [now - before for now, before in zip(counts, previous_counts[name])]
            # Rows whose counts only decreased (or stayed the same) are not reported.
            if max(delta) > 0:
                changes[name] = delta

        rows = []
        for name, values in changes.items():
            cells = []
            for value in values:
                text = str(value)
                if text != "0" and not text.startswith("-"):
                    text = f"+{text}"
                cells.append(text.rjust(9))
            rows.append(" | ".join(cells) + " | " + name)

        rows.sort(key=lambda row: row.split("| ")[-1])
        return rows
    @property
    def model_failures(self) -> Dict:
        # Obtain per-model failures
@@ -331,44 +369,86 @@ class Message:
                model_reports.append(report)
        # (Possibly truncated) reports for the current workflow run - to be sent to Slack channels
        model_header = "Single PT |  Multi PT | Single TF |  Multi TF |     Other | Category\n"
-        sorted_model_reports = sorted(model_reports, key=lambda s: s.split("] ")[-1])
+        sorted_model_reports = sorted(model_reports, key=lambda s: s.split("| ")[-1])
        model_failures_report = prepare_reports(
            title="These following model modules had failures", header=model_header, reports=sorted_model_reports
        )
        module_header = "Single |  Multi | Category\n"
-        sorted_module_reports = sorted(other_module_reports, key=lambda s: s.split("] ")[-1])
+        sorted_module_reports = sorted(other_module_reports, key=lambda s: s.split("| ")[-1])
        module_failures_report = prepare_reports(
            title="The following non-model modules had failures", header=module_header, reports=sorted_module_reports
        )
        # To be sent to Slack channels
        model_failure_sections = [
            {"type": "section", "text": {"type": "mrkdwn", "text": model_failures_report}},
            {"type": "section", "text": {"type": "mrkdwn", "text": module_failures_report}},
        ]
-        # Save complete tables (for past CI) - to be uploaded as artifacts
+        # Save the complete (i.e. no truncation) failure tables (of the current workflow run)
-        if ci_event.startswith("Past CI"):
+        # (to be uploaded as artifacts)
-            model_failures_report = prepare_reports(
+        if not os.path.isdir(os.path.join(os.getcwd(), "test_failure_tables")):
-                title="These following model modules had failures",
+            os.makedirs(os.path.join(os.getcwd(), "test_failure_tables"))
                header=model_header,
                reports=sorted_model_reports,
                to_truncate=False,
            )
            file_path = os.path.join(os.getcwd(), "test_failure_tables/model_failures_report.txt")
            with open(file_path, "w", encoding="UTF-8") as fp:
                fp.write(model_failures_report)
-            module_failures_report = prepare_reports(
+        model_failures_report = prepare_reports(
-                title="The following non-model modules had failures",
+            title="These following model modules had failures",
-                header=module_header,
+            header=model_header,
-                reports=sorted_module_reports,
+            reports=sorted_model_reports,
-                to_truncate=False,
+            to_truncate=False,
        )
        file_path = os.path.join(os.getcwd(), "test_failure_tables/model_failures_report.txt")
        with open(file_path, "w", encoding="UTF-8") as fp:
            fp.write(model_failures_report)
        module_failures_report = prepare_reports(
            title="The following non-model modules had failures",
            header=module_header,
            reports=sorted_module_reports,
            to_truncate=False,
        )
        file_path = os.path.join(os.getcwd(), "test_failure_tables/module_failures_report.txt")
        with open(file_path, "w", encoding="UTF-8") as fp:
            fp.write(module_failures_report)
        target_workflow = "huggingface/transformers/.github/workflows/self-scheduled.yml@refs/heads/main"
        if os.environ.get("CI_WORKFLOW_REF") == target_workflow:
            # Get the last previously completed CI's failure tables
            artifact_names = ["test_failure_tables"]
            output_dir = os.path.join(os.getcwd(), "previous_reports")
            os.makedirs(output_dir, exist_ok=True)
            prev_tables = get_last_daily_ci_reports(
                artifact_names=artifact_names, output_dir=output_dir, token=os.environ["ACCESS_REPO_INFO_TOKEN"]
            )
-            file_path = os.path.join(os.getcwd(), "test_failure_tables/module_failures_report.txt")
+
-            with open(file_path, "w", encoding="UTF-8") as fp:
+            # The last run may not have produced `test_failure_tables` (because of some issue, or because it had no model failure at all)
-                fp.write(module_failures_report)
+            if len(prev_tables) > 0:
                # Compute the difference of the previous/current (model failure) table
                prev_model_failures = prev_tables["test_failure_tables"]["model_failures_report.txt"]
                entries_changed = self.compute_diff_for_failure_reports(model_failures_report, prev_model_failures)
                if len(entries_changed) > 0:
                    # Save the complete difference
                    diff_report = prepare_reports(
                        title="Changed model modules failures",
                        header=model_header,
                        reports=entries_changed,
                        to_truncate=False,
                    )
                    file_path = os.path.join(os.getcwd(), "test_failure_tables/changed_model_failures_report.txt")
                    with open(file_path, "w", encoding="UTF-8") as fp:
                        fp.write(diff_report)
                    # To be sent to Slack channels
                    diff_report = prepare_reports(
                        title="*Changed model modules failures*",
                        header=model_header,
                        reports=entries_changed,
                    )
                    model_failure_sections.append(
                        {"type": "section", "text": {"type": "mrkdwn", "text": diff_report}},
                    )
        return model_failure_sections
@@ -487,14 +567,15 @@ class Message:
        )
    def post(self):
        payload = self.payload
        print("Sending the following payload")
-        print(json.dumps({"blocks": json.loads(self.payload)}))
+        print(json.dumps({"blocks": json.loads(payload)}))
        text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed."
        self.thread_ts = client.chat_postMessage(
            channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"],
-            blocks=self.payload,
+            blocks=payload,
            text=text,
        )
@@ -748,6 +829,9 @@ if __name__ == "__main__":
        else:
            ci_title = f"<{ci_url}|{ci_title}>\nAuthor: {ci_author} | Merged by: {merged_by}"
    elif ci_sha:
        ci_title = f"<{ci_url}|commit: {ci_sha}>"
    else:
        ci_title = ""