new failure CI reports for all jobs (#38298)

* new failures

* report_repo_id

* report_repo_id

* report_repo_id

* More fixes

* More fixes

* More fixes

* ruff

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
Yih-Dar
2025-05-24 19:15:02 +02:00
committed by GitHub
parent 31f8a0fe8a
commit d0c9c66d1c
11 changed files with 248 additions and 121 deletions

View File

@@ -9,6 +9,18 @@ on:
start_sha:
required: true
type: string
job:
required: true
type: string
slack_report_channel:
required: true
type: string
ci_event:
required: true
type: string
report_repo_id:
required: true
type: string
env:
@@ -26,7 +38,7 @@ env:
jobs:
run_models_gpu:
check_new_failures:
name: " "
runs-on:
group: aws-g4dn-4xlarge-cache
@@ -36,17 +48,17 @@ jobs:
steps:
- uses: actions/download-artifact@v4
with:
name: ci_results_run_models_gpu
path: /transformers/ci_results_run_models_gpu
name: ci_results_${{ inputs.job }}
path: /transformers/ci_results_${{ inputs.job }}
- name: Check file
working-directory: /transformers
run: |
if [ -f ci_results_run_models_gpu/new_model_failures.json ]; then
echo "`ci_results_run_models_gpu/new_model_failures.json` exists, continue ..."
if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then
echo "`ci_results_${{ inputs.job }}/new_failures.json` exists, continue ..."
echo "process=true" >> $GITHUB_ENV
else
echo "`ci_results_run_models_gpu/new_model_failures.json` doesn't exist, abort."
echo "`ci_results_${{ inputs.job }}/new_failures.json` doesn't exist, abort."
echo "process=false" >> $GITHUB_ENV
fi
@@ -112,14 +124,14 @@ jobs:
- name: Check failed tests
working-directory: /transformers
if: ${{ env.process == 'true' }}
run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_run_models_gpu/new_model_failures.json --output_file new_model_failures_with_bad_commit.json
run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit.json
- name: Show results
working-directory: /transformers
if: ${{ env.process == 'true' }}
run: |
ls -l new_model_failures_with_bad_commit.json
cat new_model_failures_with_bad_commit.json
ls -l new_failures_with_bad_commit.json
cat new_failures_with_bad_commit.json
- name: Checkout back
working-directory: /transformers
@@ -134,6 +146,8 @@ jobs:
env:
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
JOB_NAME: ${{ inputs.job }}
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
run: |
python3 utils/process_bad_commit_report.py
@@ -144,6 +158,8 @@ jobs:
env:
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
JOB_NAME: ${{ inputs.job }}
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
run: |
{
echo 'REPORT_TEXT<<EOF'
@@ -151,17 +167,31 @@ jobs:
echo EOF
} >> "$GITHUB_ENV"
- name: Prepare Slack report title
working-directory: /transformers
if: ${{ env.process == 'true' }}
run: |
pip install slack_sdk
echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV
- name: Send processed report
if: ${{ env.process == 'true' && !endsWith(env.REPORT_TEXT, '{}') }}
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
with:
# Slack channel id, channel name, or user id to post message.
# See also: https://api.slack.com/methods/chat.postMessage#channels
channel-id: '#transformers-ci-feedback-tests'
channel-id: '#${{ inputs.slack_report_channel }}'
# For posting a rich message using Block Kit
payload: |
{
"blocks": [
{
"type": "header",
"text": {
"type": "plain_text",
"text": "${{ env.title }}"
}
},
{
"type": "section",
"text": {

View File

@@ -19,6 +19,7 @@ jobs:
runner: mi210
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi210
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit
torch-pipeline:
@@ -30,6 +31,7 @@ jobs:
runner: mi210
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi210
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit
example-ci:
@@ -41,6 +43,7 @@ jobs:
runner: mi210
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi210
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit
deepspeed-ci:
@@ -52,4 +55,5 @@ jobs:
runner: mi210
docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
ci_event: Scheduled CI (AMD) - mi210
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit

View File

@@ -19,6 +19,7 @@ jobs:
runner: mi250
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi250
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit
torch-pipeline:
@@ -30,6 +31,7 @@ jobs:
runner: mi250
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi250
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit
example-ci:
@@ -41,6 +43,7 @@ jobs:
runner: mi250
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi250
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit
deepspeed-ci:
@@ -52,4 +55,5 @@ jobs:
runner: mi250
docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
ci_event: Scheduled CI (AMD) - mi250
report_repo_id: optimum-amd/transformers_daily_ci
secrets: inherit

View File

@@ -54,6 +54,7 @@ jobs:
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
torch-pipeline:
@@ -65,6 +66,7 @@ jobs:
runner: daily-ci
docker: huggingface/transformers-pytorch-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
tf-pipeline:
@@ -76,6 +78,7 @@ jobs:
runner: daily-ci
docker: huggingface/transformers-tensorflow-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
example-ci:
@@ -87,6 +90,7 @@ jobs:
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
trainer-fsdp-ci:
@@ -98,6 +102,7 @@ jobs:
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
deepspeed-ci:
@@ -110,6 +115,7 @@ jobs:
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
ci_event: Daily CI
working-directory-prefix: /workspace
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit
quantization-ci:
@@ -121,4 +127,5 @@ jobs:
runner: daily-ci
docker: huggingface/transformers-quantization-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
secrets: inherit

View File

@@ -28,6 +28,10 @@ on:
default: ''
required: false
type: string
report_repo_id:
required: true
type: string
env:
HF_HOME: /mnt/cache
@@ -584,15 +588,22 @@ jobs:
folder_slices: ${{ needs.setup.outputs.folder_slices }}
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
ci_event: ${{ inputs.ci_event }}
report_repo_id: ${{ inputs.report_repo_id }}
secrets: inherit
check_new_model_failures:
if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job == 'run_models_gpu' && needs.send_results.result == 'success' }}
name: Check new model failures
check_new_failures:
# TODO: work on `run_quantization_torch_gpu`
if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job != 'run_quantization_torch_gpu' && needs.send_results.result == 'success' }}
name: Check new failures
needs: send_results
uses: ./.github/workflows/check_failed_model_tests.yml
uses: ./.github/workflows/check_failed_tests.yml
with:
docker: ${{ inputs.docker }}
start_sha: ${{ github.sha }}
job: ${{ inputs.job }}
slack_report_channel: ${{ inputs.slack_report_channel }}
ci_event: ${{ inputs.ci_event }}
report_repo_id: ${{ inputs.report_repo_id }}
secrets: inherit

View File

@@ -21,6 +21,9 @@ on:
ci_event:
required: true
type: string
report_repo_id:
required: true
type: string
env:
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
@@ -67,6 +70,7 @@ jobs:
CI_SHA: ${{ github.sha }}
CI_TEST_JOB: ${{ inputs.job }}
SETUP_STATUS: ${{ inputs.setup_status }}
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
# For a job that doesn't depend on (i.e. `needs`) `setup`, the value for `inputs.folder_slices` would be an
@@ -96,6 +100,7 @@ jobs:
CI_SHA: ${{ github.sha }}
CI_TEST_JOB: ${{ inputs.job }}
SETUP_STATUS: ${{ inputs.setup_status }}
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
# We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change
# `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`.
run: |