Better way to run AMD CI with different flavors (#26634)
* Enable testing against mi250
* Change BERT to trigger tests
* Revert BERT's change
* AMD CI
* AMD CI

---------

Co-authored-by: Morgan Funtowicz <funtowiczmo@gmail.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
25
.github/workflows/self-push-amd-mi210-caller.yml
vendored
Normal file
25
.github/workflows/self-push-amd-mi210-caller.yml
vendored
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
name: Self-hosted runner (AMD mi210 CI caller)
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_run:
|
||||||
|
workflows: ["Self-hosted runner (push-caller)"]
|
||||||
|
branches: ["main"]
|
||||||
|
types: [completed]
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- run_amd_push_ci_caller*
|
||||||
|
paths:
|
||||||
|
- "src/**"
|
||||||
|
- "tests/**"
|
||||||
|
- ".github/**"
|
||||||
|
- "templates/**"
|
||||||
|
- "utils/**"
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
run_amd_ci:
|
||||||
|
name: AMD mi210
|
||||||
|
if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
|
||||||
|
uses: ./.github/workflows/self-push-amd.yml
|
||||||
|
with:
|
||||||
|
gpu_flavor: mi210
|
||||||
|
secrets: inherit
|
||||||
25
.github/workflows/self-push-amd-mi250-caller.yml
vendored
Normal file
25
.github/workflows/self-push-amd-mi250-caller.yml
vendored
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
name: Self-hosted runner (AMD mi250 CI caller)
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_run:
|
||||||
|
workflows: ["Self-hosted runner (push-caller)"]
|
||||||
|
branches: ["main"]
|
||||||
|
types: [completed]
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- run_amd_push_ci_caller*
|
||||||
|
paths:
|
||||||
|
- "src/**"
|
||||||
|
- "tests/**"
|
||||||
|
- ".github/**"
|
||||||
|
- "templates/**"
|
||||||
|
- "utils/**"
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
run_amd_ci:
|
||||||
|
name: AMD mi250
|
||||||
|
if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
|
||||||
|
uses: ./.github/workflows/self-push-amd.yml
|
||||||
|
with:
|
||||||
|
gpu_flavor: mi250
|
||||||
|
secrets: inherit
|
||||||
31
.github/workflows/self-push-amd.yml
vendored
31
.github/workflows/self-push-amd.yml
vendored
@@ -1,21 +1,11 @@
|
|||||||
name: Self-hosted runner AMD GPU (push)
|
name: Self-hosted runner AMD GPU (push)
|
||||||
|
|
||||||
on:
|
on:
|
||||||
workflow_run:
|
workflow_call:
|
||||||
workflows: ["Self-hosted runner (push-caller)"]
|
inputs:
|
||||||
branches: ["main"]
|
gpu_flavor:
|
||||||
types: [completed]
|
required: true
|
||||||
push:
|
type: string
|
||||||
branches:
|
|
||||||
- ci_*
|
|
||||||
- ci-*
|
|
||||||
paths:
|
|
||||||
- "src/**"
|
|
||||||
- "tests/**"
|
|
||||||
- ".github/**"
|
|
||||||
- "templates/**"
|
|
||||||
- "utils/**"
|
|
||||||
repository_dispatch:
|
|
||||||
|
|
||||||
env:
|
env:
|
||||||
HF_HOME: /mnt/cache
|
HF_HOME: /mnt/cache
|
||||||
@@ -45,8 +35,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [single-gpu, multi-gpu]
|
machine_type: [single-gpu, multi-gpu]
|
||||||
gpu_flavor: [mi210]
|
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
|
||||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
|
|
||||||
container:
|
container:
|
||||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||||
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
@@ -65,8 +54,7 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
machine_type: [single-gpu, multi-gpu]
|
machine_type: [single-gpu, multi-gpu]
|
||||||
gpu_flavor: [mi210]
|
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
|
||||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
|
|
||||||
container:
|
container:
|
||||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||||
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
@@ -164,8 +152,7 @@ jobs:
|
|||||||
matrix:
|
matrix:
|
||||||
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
|
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
|
||||||
machine_type: [single-gpu, multi-gpu]
|
machine_type: [single-gpu, multi-gpu]
|
||||||
gpu_flavor: [mi210]
|
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
|
||||||
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
|
|
||||||
container:
|
container:
|
||||||
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
|
||||||
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||||
@@ -321,7 +308,7 @@ jobs:
|
|||||||
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
|
||||||
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
|
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
|
||||||
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||||
CI_EVENT: push
|
CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }}
|
||||||
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
|
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
|
||||||
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
|
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
|
||||||
CI_SHA: ${{ env.CI_SHA }}
|
CI_SHA: ${{ env.CI_SHA }}
|
||||||
|
|||||||
@@ -897,6 +897,9 @@ if __name__ == "__main__":
|
|||||||
job_name_prefix = f"{framework} {version}"
|
job_name_prefix = f"{framework} {version}"
|
||||||
elif ci_event.startswith("Nightly CI"):
|
elif ci_event.startswith("Nightly CI"):
|
||||||
job_name_prefix = "Nightly CI"
|
job_name_prefix = "Nightly CI"
|
||||||
|
elif ci_event.startswith("Push CI (AMD) - "):
|
||||||
|
flavor = ci_event.replace("Push CI (AMD) - ", "")
|
||||||
|
job_name_prefix = f"AMD {flavor}"
|
||||||
|
|
||||||
for model in model_results.keys():
|
for model in model_results.keys():
|
||||||
for artifact_path in available_artifacts[f"run_all_tests_gpu_{model}_test_reports"].paths:
|
for artifact_path in available_artifacts[f"run_all_tests_gpu_{model}_test_reports"].paths:
|
||||||
@@ -962,7 +965,7 @@ if __name__ == "__main__":
|
|||||||
"Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports",
|
"Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports",
|
||||||
}
|
}
|
||||||
|
|
||||||
if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"):
|
if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI") or ci_event.startswith("Push CI (AMD)"):
|
||||||
del additional_files["Examples directory"]
|
del additional_files["Examples directory"]
|
||||||
del additional_files["PyTorch pipelines"]
|
del additional_files["PyTorch pipelines"]
|
||||||
del additional_files["TensorFlow pipelines"]
|
del additional_files["TensorFlow pipelines"]
|
||||||
@@ -1027,6 +1030,6 @@ if __name__ == "__main__":
|
|||||||
message = Message(title, ci_title, model_results, additional_results, selected_warnings=selected_warnings)
|
message = Message(title, ci_title, model_results, additional_results, selected_warnings=selected_warnings)
|
||||||
|
|
||||||
# send report only if there is any failure (for push CI)
|
# send report only if there is any failure (for push CI)
|
||||||
if message.n_failures or ci_event != "push":
|
if message.n_failures or (ci_event != "push" and not ci_event.startswith("Push CI (AMD)")):
|
||||||
message.post()
|
message.post()
|
||||||
message.post_reply()
|
message.post_reply()
|
||||||
|
|||||||
Reference in New Issue
Block a user