From feec294dea831568068ea90f729b2be3a253b024 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 20 May 2025 19:34:58 +0200 Subject: [PATCH] CI reporting improvements (#38230) update Co-authored-by: ydshieh --- .../workflows/check_failed_model_tests.yml | 51 +++- .github/workflows/self-scheduled-caller.yml | 35 +++ .github/workflows/slack-report.yml | 18 +- utils/check_bad_commit.py | 3 +- utils/get_previous_daily_ci.py | 86 ++++-- utils/notification_service.py | 273 ++++++++++-------- utils/notification_service_quantization.py | 42 ++- utils/process_bad_commit_report.py | 22 +- 8 files changed, 375 insertions(+), 155 deletions(-) diff --git a/.github/workflows/check_failed_model_tests.yml b/.github/workflows/check_failed_model_tests.yml index 8366707845..653b50e4cf 100644 --- a/.github/workflows/check_failed_model_tests.yml +++ b/.github/workflows/check_failed_model_tests.yml @@ -39,55 +39,100 @@ jobs: name: ci_results_run_models_gpu path: /transformers/ci_results_run_models_gpu + - name: Check file + working-directory: /transformers + run: | + if [ -f ci_results_run_models_gpu/new_model_failures.json ]; then + echo "`ci_results_run_models_gpu/new_model_failures.json` exists, continue ..." + echo "process=true" >> $GITHUB_ENV + else + echo "`ci_results_run_models_gpu/new_model_failures.json` doesn't exist, abort." 
+ echo "process=false" >> $GITHUB_ENV + fi + + - uses: actions/download-artifact@v4 + if: ${{ env.process == 'true' }} + with: + pattern: setup_values* + path: setup_values + merge-multiple: true + + - name: Prepare some setup values + if: ${{ env.process == 'true' }} + run: | + if [ -f setup_values/prev_workflow_run_id.txt ]; then + echo "PREV_WORKFLOW_RUN_ID=$(cat setup_values/prev_workflow_run_id.txt)" >> $GITHUB_ENV + else + echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV + fi + + if [ -f setup_values/other_workflow_run_id.txt ]; then + echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV + else + echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV + fi + - name: Update clone working-directory: /transformers + if: ${{ env.process == 'true' }} run: git fetch && git checkout ${{ github.sha }} - name: Get target commit working-directory: /transformers/utils + if: ${{ env.process == 'true' }} run: | - echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"]); print(commit)')" >> $GITHUB_ENV + echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV - name: Checkout to `start_sha` working-directory: /transformers + if: ${{ env.process == 'true' }} run: git fetch && git checkout ${{ inputs.start_sha }} - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: /transformers + if: ${{ env.process == 'true' }} run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
- name: NVIDIA-SMI + if: ${{ env.process == 'true' }} run: | nvidia-smi - name: Environment working-directory: /transformers + if: ${{ env.process == 'true' }} run: | python3 utils/print_env.py - name: Show installed libraries and their versions working-directory: /transformers + if: ${{ env.process == 'true' }} run: pip freeze - name: Check failed tests working-directory: /transformers + if: ${{ env.process == 'true' }} run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_run_models_gpu/new_model_failures.json --output_file new_model_failures_with_bad_commit.json - name: Show results working-directory: /transformers + if: ${{ env.process == 'true' }} run: | ls -l new_model_failures_with_bad_commit.json cat new_model_failures_with_bad_commit.json - name: Checkout back working-directory: /transformers + if: ${{ env.process == 'true' }} run: | git checkout ${{ inputs.start_sha }} - name: Process report shell: bash working-directory: /transformers + if: ${{ env.process == 'true' }} env: + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} run: | python3 utils/process_bad_commit_report.py @@ -95,7 +140,9 @@ jobs: - name: Process report shell: bash working-directory: /transformers + if: ${{ env.process == 'true' }} env: + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} run: | { @@ -105,7 +152,7 @@ jobs: } >> "$GITHUB_ENV" - name: Send processed report - if: ${{ !endsWith(env.REPORT_TEXT, '{}') }} + if: ${{ env.process == 'true' && !endsWith(env.REPORT_TEXT, '{}') }} uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 with: # Slack channel id, channel name, or user id to post message. 
diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml index 8589f4a810..77b33850fe 100644 --- a/.github/workflows/self-scheduled-caller.yml +++ b/.github/workflows/self-scheduled-caller.yml @@ -8,8 +8,43 @@ on: push: branches: - run_scheduled_ci* + workflow_dispatch: + inputs: + prev_workflow_run_id: + description: 'previous workflow run id to compare' + type: string + required: false + default: "" + other_workflow_run_id: + description: 'other workflow run id to compare' + type: string + required: false + default: "" + + +# Used for `push` to easily modify the target workflow runs to compare against +env: + prev_workflow_run_id: "" + other_workflow_run_id: "" + jobs: + setup: + name: Setup + runs-on: ubuntu-22.04 + steps: + - name: Setup + run: | + mkdir "setup_values" + echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt" + echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt" + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: setup_values + path: setup_values + model-ci: name: Model CI uses: ./.github/workflows/self-scheduled.yml diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml index cbea37ff56..bea113ca03 100644 --- a/.github/workflows/slack-report.yml +++ b/.github/workflows/slack-report.yml @@ -39,6 +39,21 @@ jobs: - uses: actions/checkout@v4 - uses: actions/download-artifact@v4 + + - name: Prepare some setup values + run: | + if [ -f setup_values/prev_workflow_run_id.txt ]; then + echo "PREV_WORKFLOW_RUN_ID=$(cat setup_values/prev_workflow_run_id.txt)" >> $GITHUB_ENV + else + echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV + fi + + if [ -f setup_values/other_workflow_run_id.txt ]; then + echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV + else + echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV + 
fi + - name: Send message to Slack if: ${{ inputs.job != 'run_quantization_torch_gpu' }} env: @@ -50,7 +65,6 @@ jobs: ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} CI_EVENT: ${{ inputs.ci_event }} CI_SHA: ${{ github.sha }} - CI_WORKFLOW_REF: ${{ github.workflow_ref }} CI_TEST_JOB: ${{ inputs.job }} SETUP_STATUS: ${{ inputs.setup_status }} # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change @@ -58,7 +72,6 @@ jobs: # For a job that doesn't depend on (i.e. `needs`) `setup`, the value for `inputs.folder_slices` would be an # empty string, and the called script still get one argument (which is the emtpy string). run: | - sudo apt-get install -y curl pip install huggingface_hub pip install slack_sdk pip show slack_sdk @@ -86,7 +99,6 @@ jobs: # We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`. 
run: | - sudo apt-get install -y curl pip install huggingface_hub pip install slack_sdk pip show slack_sdk diff --git a/utils/check_bad_commit.py b/utils/check_bad_commit.py index 9926111f72..5d21b1c465 100644 --- a/utils/check_bad_commit.py +++ b/utils/check_bad_commit.py @@ -144,7 +144,8 @@ def get_commit_info(commit): url = f"https://api.github.com/repos/huggingface/transformers/pulls/{pr_number}" pr_for_commit = requests.get(url).json() author = pr_for_commit["user"]["login"] - merged_author = pr_for_commit["merged_by"]["login"] + if pr_for_commit["merged_by"] is not None: + merged_author = pr_for_commit["merged_by"]["login"] if author is None: url = f"https://api.github.com/repos/huggingface/transformers/commits/{commit}" diff --git a/utils/get_previous_daily_ci.py b/utils/get_previous_daily_ci.py index efd7d24a75..c9248facf9 100644 --- a/utils/get_previous_daily_ci.py +++ b/utils/get_previous_daily_ci.py @@ -5,7 +5,7 @@ import requests from get_ci_error_statistics import download_artifact, get_artifacts_links -def get_daily_ci_runs(token, num_runs=7): +def get_daily_ci_runs(token, num_runs=7, workflow_id=None): """Get the workflow runs of the scheduled (daily) CI. This only selects the runs triggered by the `schedule` event on the `main` branch. @@ -18,7 +18,13 @@ def get_daily_ci_runs(token, num_runs=7): # From a given workflow run (where we have workflow run id), we can get the workflow id by going to # https://api.github.com/repos/huggingface/transformers/actions/runs/{workflow_run_id} # and check the `workflow_id` key. 
- workflow_id = "90575235" + + if not workflow_id: + workflow_run_id = os.environ["GITHUB_RUN_ID"] + workflow_run = requests.get( + f"https://api.github.com/repos/huggingface/transformers/actions/runs/{workflow_run_id}", headers=headers + ).json() + workflow_id = workflow_run["workflow_id"] url = f"https://api.github.com/repos/huggingface/transformers/actions/workflows/{workflow_id}/runs" # On `main` branch + event being `schedule` + not returning PRs + only `num_runs` results @@ -29,33 +35,64 @@ def get_daily_ci_runs(token, num_runs=7): return result["workflow_runs"] -def get_last_daily_ci_runs(token): +def get_last_daily_ci_run(token, workflow_run_id=None, workflow_id=None, commit_sha=None): """Get the last completed workflow run id of the scheduled (daily) CI.""" - workflow_runs = get_daily_ci_runs(token) - workflow_run_id = None - for workflow_run in workflow_runs: - if workflow_run["status"] == "completed": - workflow_run_id = workflow_run["id"] + headers = None + if token is not None: + headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"} + + workflow_run = None + if workflow_run_id is not None and workflow_run_id != "": + workflow_run = requests.get( + f"https://api.github.com/repos/huggingface/transformers/actions/runs/{workflow_run_id}", headers=headers + ).json() + return workflow_run + + workflow_runs = get_daily_ci_runs(token, workflow_id=workflow_id) + for run in workflow_runs: + if commit_sha in [None, ""] and run["status"] == "completed": + workflow_run = run break + # if `commit_sha` is specified, and `workflow_run["head_sha"]` matches it, return it. 
+ elif commit_sha not in [None, ""] and run["head_sha"] == commit_sha: + workflow_run = run + break + + return workflow_run + + +def get_last_daily_ci_workflow_run_id(token, workflow_run_id=None, workflow_id=None, commit_sha=None): + """Get the last completed workflow run id of the scheduled (daily) CI.""" + if workflow_run_id is not None and workflow_run_id != "": + return workflow_run_id + + workflow_run = get_last_daily_ci_run(token, workflow_id=workflow_id, commit_sha=commit_sha) + workflow_run_id = None + if workflow_run is not None: + workflow_run_id = workflow_run["id"] return workflow_run_id -def get_last_daily_ci_run_commit(token): +def get_last_daily_ci_run_commit(token, workflow_run_id=None, workflow_id=None, commit_sha=None): """Get the commit sha of the last completed scheduled daily CI workflow run.""" - workflow_runs = get_daily_ci_runs(token) - head_sha = None - for workflow_run in workflow_runs: - if workflow_run["status"] == "completed": - head_sha = workflow_run["head_sha"] - break + workflow_run = get_last_daily_ci_run( + token, workflow_run_id=workflow_run_id, workflow_id=workflow_id, commit_sha=commit_sha + ) + workflow_run_head_sha = None + if workflow_run is not None: + workflow_run_head_sha = workflow_run["head_sha"] - return head_sha + return workflow_run_head_sha -def get_last_daily_ci_artifacts(artifact_names, output_dir, token): +def get_last_daily_ci_artifacts( + artifact_names, output_dir, token, workflow_run_id=None, workflow_id=None, commit_sha=None +): """Get the artifacts of last completed workflow run id of the scheduled (daily) CI.""" - workflow_run_id = get_last_daily_ci_runs(token) + workflow_run_id = get_last_daily_ci_workflow_run_id( + token, workflow_run_id=workflow_run_id, workflow_id=workflow_id, commit_sha=commit_sha + ) if workflow_run_id is not None: artifacts_links = get_artifacts_links(worflow_run_id=workflow_run_id, token=token) for artifact_name in artifact_names: @@ -66,9 +103,18 @@ def 
get_last_daily_ci_artifacts(artifact_names, output_dir, token): ) -def get_last_daily_ci_reports(artifact_names, output_dir, token): +def get_last_daily_ci_reports( + artifact_names, output_dir, token, workflow_run_id=None, workflow_id=None, commit_sha=None +): """Get the artifacts' content of the last completed workflow run id of the scheduled (daily) CI.""" - get_last_daily_ci_artifacts(artifact_names, output_dir, token) + get_last_daily_ci_artifacts( + artifact_names, + output_dir, + token, + workflow_run_id=workflow_run_id, + workflow_id=workflow_id, + commit_sha=commit_sha, + ) results = {} for artifact_name in artifact_names: diff --git a/utils/notification_service.py b/utils/notification_service.py index 399c792c9d..407ee47e59 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -14,7 +14,6 @@ import ast import collections -import datetime import functools import json import operator @@ -26,7 +25,7 @@ from typing import Any, Dict, List, Optional, Union import requests from get_ci_error_statistics import get_jobs -from get_previous_daily_ci import get_last_daily_ci_reports +from get_previous_daily_ci import get_last_daily_ci_reports, get_last_daily_ci_run, get_last_daily_ci_workflow_run_id from huggingface_hub import HfApi from slack_sdk import WebClient @@ -109,6 +108,7 @@ class Message: additional_results: Dict, selected_warnings: Optional[List] = None, prev_ci_artifacts=None, + other_ci_artifacts=None, ): self.title = title self.ci_title = ci_title @@ -159,6 +159,7 @@ class Message: self.selected_warnings = selected_warnings self.prev_ci_artifacts = prev_ci_artifacts + self.other_ci_artifacts = other_ci_artifacts @property def time(self) -> str: @@ -515,71 +516,83 @@ class Message: if len(self.selected_warnings) > 0: blocks.append(self.warnings) - new_failure_blocks = self.get_new_model_failure_blocks(with_header=False) - if len(new_failure_blocks) > 0: - blocks.extend(new_failure_blocks) + for idx, (prev_workflow_run_id, 
prev_ci_artifacts) in enumerate( + [self.prev_ci_artifacts] + self.other_ci_artifacts + ): + if idx == 0: + # This is the truncated version to show on slack. For now. + new_failure_blocks = self.get_new_model_failure_blocks( + prev_ci_artifacts=prev_ci_artifacts, with_header=False + ) + if len(new_failure_blocks) > 0: + blocks.extend(new_failure_blocks) - # To save the list of new model failures - extra_blocks = self.get_new_model_failure_blocks(to_truncate=False) - if extra_blocks: - failure_text = extra_blocks[-1]["text"]["text"] - file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/new_model_failures.txt") - with open(file_path, "w", encoding="UTF-8") as fp: - fp.write(failure_text) + # To save the list of new model failures and upload them to hub repositories + extra_blocks = self.get_new_model_failure_blocks(prev_ci_artifacts=prev_ci_artifacts, to_truncate=False) + if extra_blocks: + filename = "new_model_failures" + if idx > 0: + filename = f"{filename}_against_{prev_workflow_run_id}" - # upload results to Hub dataset - file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/new_model_failures.txt") - commit_info = api.upload_file( - path_or_fileobj=file_path, - path_in_repo=f"{datetime.datetime.today().strftime('%Y-%m-%d')}/ci_results_{job_name}/new_model_failures.txt", - repo_id="hf-internal-testing/transformers_daily_ci", - repo_type="dataset", - token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), - ) - url = f"https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/raw/{commit_info.oid}/{datetime.datetime.today().strftime('%Y-%m-%d')}/ci_results_{job_name}/new_model_failures.txt" + failure_text = extra_blocks[-1]["text"]["text"] + file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/{filename}.txt") + with open(file_path, "w", encoding="UTF-8") as fp: + fp.write(failure_text) - # extra processing to save to json format - new_failed_tests = {} - for line in failure_text.split(): - if 
"https://github.com/huggingface/transformers/actions/runs" in line: - pattern = r"<(https://github.com/huggingface/transformers/actions/runs/.+?/job/.+?)\|(.+?)>" - items = re.findall(pattern, line) - elif "tests/" in line: - if "tests/models/" in line: - model = line.split("/")[2] - else: - model = line.split("/")[1] - if model not in new_failed_tests: - new_failed_tests[model] = {"single-gpu": [], "multi-gpu": []} - for url, device in items: - new_failed_tests[model][f"{device}-gpu"].append(line) - file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/new_model_failures.json") - with open(file_path, "w", encoding="UTF-8") as fp: - json.dump(new_failed_tests, fp, ensure_ascii=False, indent=4) + # upload results to Hub dataset + file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/{filename}.txt") + commit_info = api.upload_file( + path_or_fileobj=file_path, + path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/{filename}.txt", + repo_id="hf-internal-testing/transformers_daily_ci", + repo_type="dataset", + token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), + ) + url = f"https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/raw/{commit_info.oid}/{report_repo_folder}/ci_results_{job_name}/{filename}.txt" - # upload results to Hub dataset - file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/new_model_failures.json") - _ = api.upload_file( - path_or_fileobj=file_path, - path_in_repo=f"{datetime.datetime.today().strftime('%Y-%m-%d')}/ci_results_{job_name}/new_model_failures.json", - repo_id="hf-internal-testing/transformers_daily_ci", - repo_type="dataset", - token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), - ) + # extra processing to save to json format + new_failed_tests = {} + for line in failure_text.split(): + if "https://github.com/huggingface/transformers/actions/runs" in line: + pattern = r"<(https://github.com/huggingface/transformers/actions/runs/.+?/job/.+?)\|(.+?)>" + 
items = re.findall(pattern, line) + elif "tests/" in line: + if "tests/models/" in line: + model = line.split("/")[2] + else: + model = line.split("/")[1] + if model not in new_failed_tests: + new_failed_tests[model] = {"single-gpu": [], "multi-gpu": []} + for url, device in items: + new_failed_tests[model][f"{device}-gpu"].append(line) + file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/{filename}.json") + with open(file_path, "w", encoding="UTF-8") as fp: + json.dump(new_failed_tests, fp, ensure_ascii=False, indent=4) - block = { - "type": "section", - "text": { - "type": "plain_text", - "text": " ", - }, - "accessory": { - "type": "button", - "text": {"type": "plain_text", "text": "Check New model failures"}, - "url": url, - }, - } - blocks.append(block) + # upload results to Hub dataset + file_path = os.path.join(os.getcwd(), f"ci_results_{job_name}/{filename}.json") + _ = api.upload_file( + path_or_fileobj=file_path, + path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/{filename}.json", + repo_id="hf-internal-testing/transformers_daily_ci", + repo_type="dataset", + token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), + ) + + if idx == 0: + block = { + "type": "section", + "text": { + "type": "plain_text", + "text": " ", + }, + "accessory": { + "type": "button", + "text": {"type": "plain_text", "text": "Check New model failures"}, + "url": url, + }, + } + blocks.append(block) return json.dumps(blocks) @@ -700,18 +713,18 @@ class Message: {"type": "section", "text": {"type": "mrkdwn", "text": failure_text}}, ] - def get_new_model_failure_blocks(self, with_header=True, to_truncate=True): - if self.prev_ci_artifacts is None: + def get_new_model_failure_blocks(self, prev_ci_artifacts, with_header=True, to_truncate=True): + if prev_ci_artifacts is None: return [] sorted_dict = sorted(self.model_results.items(), key=lambda t: t[0]) prev_model_results = {} if ( - f"ci_results_{job_name}" in self.prev_ci_artifacts - and 
"model_results.json" in self.prev_ci_artifacts[f"ci_results_{job_name}"] + f"ci_results_{job_name}" in prev_ci_artifacts + and "model_results.json" in prev_ci_artifacts[f"ci_results_{job_name}"] ): - prev_model_results = json.loads(self.prev_ci_artifacts[f"ci_results_{job_name}"]["model_results.json"]) + prev_model_results = json.loads(prev_ci_artifacts[f"ci_results_{job_name}"]["model_results.json"]) all_failure_lines = {} for job, job_result in sorted_dict: @@ -812,20 +825,6 @@ class Message: time.sleep(1) - blocks = self.get_new_model_failure_blocks() - if blocks: - print("Sending the following reply") - print(json.dumps({"blocks": blocks})) - - client.chat_postMessage( - channel=SLACK_REPORT_CHANNEL_ID, - text="Results for new failures", - blocks=blocks, - thread_ts=self.thread_ts["ts"], - ) - - time.sleep(1) - def retrieve_artifact(artifact_path: str, gpu: Optional[str]): if gpu not in [None, "single", "multi"]: @@ -1168,6 +1167,23 @@ if __name__ == "__main__": "run_torch_cuda_extensions_gpu": "DeepSpeed", } + # if it is not a scheduled run, upload the reports to a subfolder under `report_repo_folder` + report_repo_subfolder = "" + if os.getenv("GITHUB_EVENT_NAME") != "schedule": + report_repo_subfolder = f"{os.getenv('GITHUB_RUN_NUMBER')}-{os.getenv('GITHUB_RUN_ID')}" + report_repo_subfolder = f"runs/{report_repo_subfolder}" + + workflow_run = get_last_daily_ci_run( + token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_run_id=os.getenv("GITHUB_RUN_ID") + ) + workflow_run_created_time = workflow_run["created_at"] + workflow_id = workflow_run["workflow_id"] + + report_repo_folder = workflow_run_created_time.split("T")[0] + + if report_repo_subfolder: + report_repo_folder = f"{report_repo_folder}/{report_repo_subfolder}" + # Remove some entries in `additional_files` if they are not concerned. 
test_name = None if job_name in job_to_test_map: @@ -1241,8 +1257,9 @@ if __name__ == "__main__": if not os.path.isdir(os.path.join(os.getcwd(), f"ci_results_{job_name}")): os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}")) - target_workflow = "huggingface/transformers/.github/workflows/self-scheduled-caller.yml@refs/heads/main" - is_scheduled_ci_run = os.environ.get("CI_WORKFLOW_REF") == target_workflow + nvidia_daily_ci_workflow = "huggingface/transformers/.github/workflows/self-scheduled-caller.yml" + is_nvidia_daily_ci_workflow = os.environ.get("GITHUB_WORKFLOW_REF").startswith(nvidia_daily_ci_workflow) + is_scheduled_ci_run = os.environ.get("GITHUB_EVENT_NAME") == "schedule" # Only the model testing job is concerned: this condition is to avoid other jobs to upload the empty list as # results. @@ -1250,15 +1267,13 @@ if __name__ == "__main__": with open(f"ci_results_{job_name}/model_results.json", "w", encoding="UTF-8") as fp: json.dump(model_results, fp, indent=4, ensure_ascii=False) - # upload results to Hub dataset (only for the scheduled daily CI run on `main`) - if is_scheduled_ci_run: - api.upload_file( - path_or_fileobj=f"ci_results_{job_name}/model_results.json", - path_in_repo=f"{datetime.datetime.today().strftime('%Y-%m-%d')}/ci_results_{job_name}/model_results.json", - repo_id="hf-internal-testing/transformers_daily_ci", - repo_type="dataset", - token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), - ) + api.upload_file( + path_or_fileobj=f"ci_results_{job_name}/model_results.json", + path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/model_results.json", + repo_id="hf-internal-testing/transformers_daily_ci", + repo_type="dataset", + token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), + ) # Let's create a file contain job --> job link model_job_links = {} @@ -1272,15 +1287,13 @@ if __name__ == "__main__": with open(f"ci_results_{job_name}/model_job_links.json", "w", encoding="UTF-8") as fp: 
json.dump(model_job_links, fp, indent=4, ensure_ascii=False) - # upload results to Hub dataset (only for the scheduled daily CI run on `main`) - if is_scheduled_ci_run: - api.upload_file( - path_or_fileobj=f"ci_results_{job_name}/model_job_links.json", - path_in_repo=f"{datetime.datetime.today().strftime('%Y-%m-%d')}/ci_results_{job_name}/model_job_links.json", - repo_id="hf-internal-testing/transformers_daily_ci", - repo_type="dataset", - token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), - ) + api.upload_file( + path_or_fileobj=f"ci_results_{job_name}/model_job_links.json", + path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/model_job_links.json", + repo_id="hf-internal-testing/transformers_daily_ci", + repo_type="dataset", + token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), + ) # Must have the same keys as in `additional_results`. # The values are used as the file names where to save the corresponding CI job results. @@ -1294,26 +1307,57 @@ if __name__ == "__main__": with open(f"ci_results_{job_name}/{test_to_result_name[job]}_results.json", "w", encoding="UTF-8") as fp: json.dump(job_result, fp, indent=4, ensure_ascii=False) - # upload results to Hub dataset (only for the scheduled daily CI run on `main`) - if is_scheduled_ci_run: - api.upload_file( - path_or_fileobj=f"ci_results_{job_name}/{test_to_result_name[job]}_results.json", - path_in_repo=f"{datetime.datetime.today().strftime('%Y-%m-%d')}/ci_results_{job_name}/{test_to_result_name[job]}_results.json", - repo_id="hf-internal-testing/transformers_daily_ci", - repo_type="dataset", - token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), - ) + api.upload_file( + path_or_fileobj=f"ci_results_{job_name}/{test_to_result_name[job]}_results.json", + path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/{test_to_result_name[job]}_results.json", + repo_id="hf-internal-testing/transformers_daily_ci", + repo_type="dataset", + 
token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), + ) + + prev_workflow_run_id = None + other_workflow_run_ids = [] - prev_ci_artifacts = None if is_scheduled_ci_run: + # TODO: remove `if job_name == "run_models_gpu"` if job_name == "run_models_gpu": - # Get the last previously completed CI's failure tables + prev_workflow_run_id = get_last_daily_ci_workflow_run_id( + token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_id=workflow_id + ) + # For a scheduled run that is not the Nvidia's scheduled daily CI, add Nvidia's scheduled daily CI run as a target to compare. + if not is_nvidia_daily_ci_workflow: + # The id of the workflow `.github/workflows/self-scheduled-caller.yml` (not of a workflow run of it). + other_workflow_id = "90575235" + # We need to get the Nvidia's scheduled daily CI run that match the current run (i.e. run with the same commit SHA) + other_workflow_run_id = get_last_daily_ci_workflow_run_id( + token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_id=other_workflow_id, commit_sha=ci_sha + ) + other_workflow_run_ids.append(other_workflow_run_id) + else: + prev_workflow_run_id = os.environ["PREV_WORKFLOW_RUN_ID"] + other_workflow_run_id = os.environ["OTHER_WORKFLOW_RUN_ID"] + other_workflow_run_ids.append(other_workflow_run_id) + + prev_ci_artifacts = (None, None) + other_ci_artifacts = [] + + for idx, target_workflow_run_id in enumerate([prev_workflow_run_id] + other_workflow_run_ids): + if target_workflow_run_id is None or target_workflow_run_id == "": + continue + else: artifact_names = [f"ci_results_{job_name}"] output_dir = os.path.join(os.getcwd(), "previous_reports") os.makedirs(output_dir, exist_ok=True) - prev_ci_artifacts = get_last_daily_ci_reports( - artifact_names=artifact_names, output_dir=output_dir, token=os.environ["ACCESS_REPO_INFO_TOKEN"] + ci_artifacts = get_last_daily_ci_reports( + artifact_names=artifact_names, + output_dir=output_dir, + token=os.environ["ACCESS_REPO_INFO_TOKEN"], + 
workflow_run_id=target_workflow_run_id, ) + if idx == 0: + prev_ci_artifacts = (target_workflow_run_id, ci_artifacts) + else: + other_ci_artifacts.append((target_workflow_run_id, ci_artifacts)) job_to_test_map.update( { @@ -1335,6 +1379,7 @@ if __name__ == "__main__": additional_results, selected_warnings=selected_warnings, prev_ci_artifacts=prev_ci_artifacts, + other_ci_artifacts=other_ci_artifacts, ) # send report only if there is any failure (for push CI) diff --git a/utils/notification_service_quantization.py b/utils/notification_service_quantization.py index f15aa68f90..dc9678c781 100644 --- a/utils/notification_service_quantization.py +++ b/utils/notification_service_quantization.py @@ -13,7 +13,6 @@ # limitations under the License. import ast -import datetime import json import os import sys @@ -21,6 +20,7 @@ import time from typing import Dict from get_ci_error_statistics import get_jobs +from get_previous_daily_ci import get_last_daily_ci_run from huggingface_hub import HfApi from notification_service import ( Message, @@ -246,24 +246,42 @@ if __name__ == "__main__": ) job_name = os.getenv("CI_TEST_JOB") + + # if it is not a scheduled run, upload the reports to a subfolder under `report_repo_folder` + report_repo_subfolder = "" + if os.getenv("GITHUB_EVENT_NAME") != "schedule": + report_repo_subfolder = f"{os.getenv('GITHUB_RUN_NUMBER')}-{os.getenv('GITHUB_RUN_ID')}" + report_repo_subfolder = f"runs/{report_repo_subfolder}" + + workflow_run = get_last_daily_ci_run( + token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_run_id=os.getenv("GITHUB_RUN_ID") + ) + workflow_run_created_time = workflow_run["created_at"] + workflow_id = workflow_run["workflow_id"] + + report_repo_folder = workflow_run_created_time.split("T")[0] + + if report_repo_subfolder: + report_repo_folder = f"{report_repo_folder}/{report_repo_subfolder}" + if not os.path.isdir(os.path.join(os.getcwd(), f"ci_results_{job_name}")): os.makedirs(os.path.join(os.getcwd(), 
f"ci_results_{job_name}")) + nvidia_daily_ci_workflow = "huggingface/transformers/.github/workflows/self-scheduled-caller.yml" + is_nvidia_daily_ci_workflow = os.environ.get("GITHUB_WORKFLOW_REF").startswith(nvidia_daily_ci_workflow) + is_scheduled_ci_run = os.environ.get("GITHUB_EVENT_NAME") == "schedule" + with open(f"ci_results_{job_name}/quantization_results.json", "w", encoding="UTF-8") as fp: json.dump(quantization_results, fp, indent=4, ensure_ascii=False) - target_workflow = "huggingface/transformers/.github/workflows/self-scheduled-caller.yml@refs/heads/main" - is_scheduled_ci_run = os.environ.get("CI_WORKFLOW_REF") == target_workflow - # upload results to Hub dataset (only for the scheduled daily CI run on `main`) - if is_scheduled_ci_run: - api.upload_file( - path_or_fileobj=f"ci_results_{job_name}/quantization_results.json", - path_in_repo=f"{datetime.datetime.today().strftime('%Y-%m-%d')}/ci_results_{job_name}/quantization_results.json", - repo_id="hf-internal-testing/transformers_daily_ci", - repo_type="dataset", - token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), - ) + api.upload_file( + path_or_fileobj=f"ci_results_{job_name}/quantization_results.json", + path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/quantization_results.json", + repo_id="hf-internal-testing/transformers_daily_ci", + repo_type="dataset", + token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), + ) message = QuantizationMessage( title, diff --git a/utils/process_bad_commit_report.py b/utils/process_bad_commit_report.py index bba03b4bd1..50c338b633 100644 --- a/utils/process_bad_commit_report.py +++ b/utils/process_bad_commit_report.py @@ -12,12 +12,12 @@ This is used by `.github/workflows/check_failed_model_tests.yml` to produce a sl ``` """ -import datetime import json import os from collections import Counter from copy import deepcopy +from get_previous_daily_ci import get_last_daily_ci_run from huggingface_hub import HfApi @@ -76,16 
+76,32 @@ if __name__ == "__main__": new_data_full[author] = {k: v for k, v in _data.items() if len(v) > 0} # Upload to Hub and get the url + # if it is not a scheduled run, upload the reports to a subfolder under `report_repo_folder` + report_repo_subfolder = "" + if os.getenv("GITHUB_EVENT_NAME") != "schedule": + report_repo_subfolder = f"{os.getenv('GITHUB_RUN_NUMBER')}-{os.getenv('GITHUB_RUN_ID')}" + report_repo_subfolder = f"runs/{report_repo_subfolder}" + + workflow_run = get_last_daily_ci_run( + token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_run_id=os.getenv("GITHUB_RUN_ID") + ) + workflow_run_created_time = workflow_run["created_at"] + + report_repo_folder = workflow_run_created_time.split("T")[0] + + if report_repo_subfolder: + report_repo_folder = f"{report_repo_folder}/{report_repo_subfolder}" + with open("new_model_failures_with_bad_commit_grouped_by_authors.json", "w") as fp: json.dump(new_data_full, fp, ensure_ascii=False, indent=4) commit_info = api.upload_file( path_or_fileobj="new_model_failures_with_bad_commit_grouped_by_authors.json", - path_in_repo=f"{datetime.datetime.today().strftime('%Y-%m-%d')}/ci_results_run_models_gpu/new_model_failures_with_bad_commit_grouped_by_authors.json", + path_in_repo=f"{report_repo_folder}/ci_results_run_models_gpu/new_model_failures_with_bad_commit_grouped_by_authors.json", repo_id="hf-internal-testing/transformers_daily_ci", repo_type="dataset", token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), ) - url = f"https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/raw/{commit_info.oid}/{datetime.datetime.today().strftime('%Y-%m-%d')}/ci_results_run_models_gpu/new_model_failures_with_bad_commit_grouped_by_authors.json" + url = f"https://huggingface.co/datasets/hf-internal-testing/transformers_daily_ci/raw/{commit_info.oid}/{report_repo_folder}/ci_results_run_models_gpu/new_model_failures_with_bad_commit_grouped_by_authors.json" # Add `GH_` prefix as keyword mention output = 
{}