Update CI with nightly torch workflow file (#40306)
* fix nightly ci
* Apply suggestions from code review

Co-authored-by: ivarflakstad <69173633+ivarflakstad@users.noreply.github.com>

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: ivarflakstad <69173633+ivarflakstad@users.noreply.github.com>
This commit is contained in:
5
.github/workflows/check_failed_tests.yml
vendored
5
.github/workflows/check_failed_tests.yml
vendored
@@ -21,6 +21,9 @@ on:
|
||||
report_repo_id:
|
||||
required: true
|
||||
type: string
|
||||
commit_sha:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
|
||||
env:
|
||||
@@ -87,7 +90,7 @@ jobs:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
if: ${{ env.process == 'true' }}
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
|
||||
- name: Get target commit
|
||||
working-directory: /transformers/utils
|
||||
|
||||
5
.github/workflows/model_jobs.yml
vendored
5
.github/workflows/model_jobs.yml
vendored
@@ -18,6 +18,9 @@ on:
|
||||
docker:
|
||||
required: true
|
||||
type: string
|
||||
commit_sha:
|
||||
required: false
|
||||
type: string
|
||||
report_name_prefix:
|
||||
required: false
|
||||
default: run_models_gpu
|
||||
@@ -70,7 +73,7 @@ jobs:
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
|
||||
37
.github/workflows/self-nightly-caller.yml
vendored
37
.github/workflows/self-nightly-caller.yml
vendored
@@ -1,43 +1,32 @@
|
||||
name: Self-hosted runner (nightly-ci)
|
||||
|
||||
name: Nvidia CI with nightly torch
|
||||
|
||||
on:
|
||||
repository_dispatch:
|
||||
schedule:
|
||||
- cron: "17 2 * * *"
|
||||
# triggered when the daily scheduled Nvidia CI is completed.
|
||||
# This way, we can compare the results more easily.
|
||||
workflow_run:
|
||||
workflows: ["Nvidia CI"]
|
||||
branches: ["main"]
|
||||
types: [completed]
|
||||
push:
|
||||
branches:
|
||||
- run_nightly_ci*
|
||||
- run_ci_with_nightly_torch*
|
||||
|
||||
jobs:
|
||||
build_nightly_ci_images:
|
||||
name: Build Nightly CI Docker Images
|
||||
if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci'))
|
||||
build_nightly_torch_ci_images:
|
||||
name: Build CI Docker Images with nightly torch
|
||||
uses: ./.github/workflows/build-nightly-ci-docker-images.yml
|
||||
secrets: inherit
|
||||
|
||||
model-ci:
|
||||
name: Model CI
|
||||
needs: [build_nightly_ci_images]
|
||||
needs: build_nightly_torch_ci_images
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_models_gpu
|
||||
slack_report_channel: "#transformers-ci-past-future"
|
||||
runner: ci
|
||||
docker: huggingface/transformers-all-latest-torch-nightly-gpu
|
||||
ci_event: Nightly CI
|
||||
secrets: inherit
|
||||
|
||||
deepspeed-ci:
|
||||
name: DeepSpeed CI
|
||||
needs: [build_nightly_ci_images]
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_torch_cuda_extensions_gpu
|
||||
slack_report_channel: "#transformers-ci-past-future"
|
||||
runner: ci
|
||||
# test deepspeed nightly build with the latest release torch
|
||||
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||
ci_event: Nightly CI
|
||||
working-directory-prefix: /workspace
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci_with_torch_nightly
|
||||
commit_sha: ${{ github.event.workflow_run.head_sha || github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
11
.github/workflows/self-scheduled-caller.yml
vendored
11
.github/workflows/self-scheduled-caller.yml
vendored
@@ -1,5 +1,4 @@
|
||||
name: Self-hosted runner (scheduled)
|
||||
|
||||
name: Nvidia CI
|
||||
|
||||
on:
|
||||
repository_dispatch:
|
||||
@@ -7,7 +6,7 @@ on:
|
||||
- cron: "17 2 * * *"
|
||||
push:
|
||||
branches:
|
||||
- run_scheduled_ci*
|
||||
- run_nvidia_ci*
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
prev_workflow_run_id:
|
||||
@@ -54,6 +53,7 @@ jobs:
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
torch-pipeline:
|
||||
@@ -65,6 +65,7 @@ jobs:
|
||||
docker: huggingface/transformers-pytorch-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
example-ci:
|
||||
@@ -76,6 +77,7 @@ jobs:
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
trainer-fsdp-ci:
|
||||
@@ -87,6 +89,7 @@ jobs:
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
deepspeed-ci:
|
||||
@@ -99,6 +102,7 @@ jobs:
|
||||
ci_event: Daily CI
|
||||
working-directory-prefix: /workspace
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
quantization-ci:
|
||||
@@ -110,4 +114,5 @@ jobs:
|
||||
docker: huggingface/transformers-quantization-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
21
.github/workflows/self-scheduled.yml
vendored
21
.github/workflows/self-scheduled.yml
vendored
@@ -1,4 +1,4 @@
|
||||
name: Self-hosted runner (scheduled)
|
||||
name: Nvidia CI (job definitions)
|
||||
|
||||
# Note that each job's dependencies go into a corresponding docker file.
|
||||
#
|
||||
@@ -28,6 +28,9 @@ on:
|
||||
report_repo_id:
|
||||
required: true
|
||||
type: string
|
||||
commit_sha:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
|
||||
env:
|
||||
@@ -46,8 +49,8 @@ env:
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
|
||||
name: Setup
|
||||
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
@@ -119,6 +122,7 @@ jobs:
|
||||
slice_id: ${{ matrix.slice_id }}
|
||||
runner_map: ${{ needs.setup.outputs.runner_map }}
|
||||
docker: ${{ inputs.docker }}
|
||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
run_trainer_and_fsdp_gpu:
|
||||
@@ -137,6 +141,7 @@ jobs:
|
||||
slice_id: ${{ matrix.slice_id }}
|
||||
runner_map: ${{ needs.setup.outputs.runner_map }}
|
||||
docker: ${{ inputs.docker }}
|
||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
report_name_prefix: run_trainer_and_fsdp_gpu
|
||||
secrets: inherit
|
||||
|
||||
@@ -155,7 +160,7 @@ jobs:
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
@@ -223,7 +228,7 @@ jobs:
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
@@ -292,7 +297,7 @@ jobs:
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
||||
@@ -400,7 +405,7 @@ jobs:
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ github.sha }}
|
||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
@@ -464,6 +469,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 2
|
||||
ref: ${{ inputs.commit_sha || github.sha }}
|
||||
|
||||
- name: Install transformers
|
||||
run: pip install transformers
|
||||
@@ -518,6 +524,7 @@ jobs:
|
||||
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
|
||||
ci_event: ${{ inputs.ci_event }}
|
||||
report_repo_id: ${{ inputs.report_repo_id }}
|
||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
|
||||
secrets: inherit
|
||||
|
||||
@@ -528,7 +535,7 @@ jobs:
|
||||
uses: ./.github/workflows/check_failed_tests.yml
|
||||
with:
|
||||
docker: ${{ inputs.docker }}
|
||||
start_sha: ${{ github.sha }}
|
||||
start_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
job: ${{ inputs.job }}
|
||||
slack_report_channel: ${{ inputs.slack_report_channel }}
|
||||
ci_event: ${{ inputs.ci_event }}
|
||||
|
||||
10
.github/workflows/slack-report.yml
vendored
10
.github/workflows/slack-report.yml
vendored
@@ -24,6 +24,10 @@ on:
|
||||
report_repo_id:
|
||||
required: true
|
||||
type: string
|
||||
commit_sha:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
|
||||
env:
|
||||
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
|
||||
@@ -41,6 +45,10 @@ jobs:
|
||||
echo "Setup status: ${{ inputs.setup_status }}"
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 2
|
||||
ref: ${{ inputs.commit_sha || github.sha }}
|
||||
|
||||
- uses: actions/download-artifact@v4
|
||||
|
||||
- name: Prepare some setup values
|
||||
@@ -67,7 +75,7 @@ jobs:
|
||||
SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
|
||||
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
CI_EVENT: ${{ inputs.ci_event }}
|
||||
CI_SHA: ${{ github.sha }}
|
||||
CI_SHA: ${{ inputs.commit_sha || github.sha }}
|
||||
CI_TEST_JOB: ${{ inputs.job }}
|
||||
SETUP_STATUS: ${{ inputs.setup_status }}
|
||||
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
|
||||
|
||||
Reference in New Issue
Block a user