From 28247e78819ab9756b81f8df39611c333d099400 Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Mon, 28 Nov 2022 21:14:33 +0100
Subject: [PATCH] Extract warnings from CI artifacts (#20474)

* extract warning from CI artifacts

* fix path

* fix logic

* fix comment

* update default values

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
---
 utils/extract_warnings.py | 113 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 utils/extract_warnings.py

diff --git a/utils/extract_warnings.py b/utils/extract_warnings.py
new file mode 100644
index 0000000000..bc795c53f7
--- /dev/null
+++ b/utils/extract_warnings.py
@@ -0,0 +1,113 @@
+import argparse
+import json
+import os
+import time
+import zipfile
+
+from get_ci_error_statistics import download_artifact, get_artifacts_links
+from transformers import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def extract_warnings_from_single_artifact(artifact_zip_path, targets):
+    """Return the set of `targets` warnings found in `warnings.txt` of one artifact zip (best effort)."""
+    selected_warnings = set()
+    buffer = []
+
+    def flush():
+        # Join the buffered body lines into one warning; keep it only if it names a category in `targets`.
+        warning = "\n".join(buffer)
+        if buffer and any(f": {x}: " in warning for x in targets):
+            selected_warnings.add(warning)
+        buffer.clear()
+
+    try:
+        with zipfile.ZipFile(artifact_zip_path) as z:
+            for filename in z.namelist():
+                # Member names live inside the archive, so match by name instead of probing the local disk.
+                if filename != "warnings.txt":
+                    continue
+                with z.open(filename) as f:
+                    for line in f:
+                        line = line.decode("UTF-8")
+                        if "warnings summary (final)" in line:
+                            continue
+                        if line.startswith(" "):
+                            # Indented lines belong to the body of the current warning.
+                            buffer.append(line.strip())
+                        else:
+                            flush()
+                # The file may end inside a warning body: flush so that last warning is not dropped.
+                flush()
+    except Exception:
+        logger.warning(
+            f"{artifact_zip_path} is either an invalid zip file or something else wrong. This file is skipped."
+        )
+
+    return selected_warnings
+
+
+def extract_warnings(artifact_dir, targets):
+    """Collect the `targets` warnings found in every `.zip` file located directly under `artifact_dir`."""
+    # A set is used so that a warning present in several artifacts is kept only once.
+    collected = set()
+    for entry in os.listdir(artifact_dir):
+        if not entry.endswith(".zip"):
+            continue
+        zip_path = os.path.join(artifact_dir, entry)
+        collected.update(extract_warnings_from_single_artifact(zip_path, targets))
+    return collected
+
+
+if __name__ == "__main__":
+
+    def list_str(values):
+        """Split a comma-separated command-line value into a list of strings."""
+        return values.split(",")
+
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--workflow_run_id", default=None, type=str, required=True, help="A GitHub Actions workflow run id."
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="Where to store the downloaded artifacts and other result files.",
+    )
+    parser.add_argument(
+        "--token", default=None, type=str, required=True, help="A token that has actions:read permission."
+    )
+    # Optional parameter: argparse also passes this string default through `list_str`.
+    parser.add_argument(
+        "--targets",
+        default="DeprecationWarning,UserWarning,FutureWarning",
+        type=list_str,
+        help="Comma-separated list of target warning(s) which we want to extract.",
+    )
+
+    args = parser.parse_args()
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Get the download links and save them next to the downloaded artifacts.
+    artifacts = get_artifacts_links(args.workflow_run_id)
+    with open(os.path.join(args.output_dir, "artifacts.json"), "w", encoding="UTF-8") as fp:
+        json.dump(artifacts, fp, ensure_ascii=False, indent=4)
+
+    # Download every artifact into the output directory.
+    for name, url in artifacts.items():
+        print(name)
+        print(url)
+        print("=" * 80)
+        download_artifact(name, url, args.output_dir, args.token)
+        # Be gentle to GitHub
+        time.sleep(1)
+
+    # Extract the targeted warnings and write them sorted, so the output order is deterministic.
+    selected_warnings = sorted(extract_warnings(args.output_dir, args.targets))
+    with open(os.path.join(args.output_dir, "selected_warnings.json"), "w", encoding="UTF-8") as fp:
+        json.dump(selected_warnings, fp, ensure_ascii=False, indent=4)