refactor: benchmarks (#33896)
* refactor: benchmarks Based on a discussion with @LysandreJik & @ArthurZucker, the goal of this PR is to improve transformers' benchmark system. This is a WIP; for the moment, the infrastructure required to make things work is not ready. I will update the PR description when it is the case. * feat: add db init in benchmarks CI * fix: pg_config is missing in runner * fix: add psql to the runner * fix: connect info from env vars + PR comments * refactor: set database as env var * fix: invalid working directory * fix: `commit_msg` -> `commit_message` * fix: git marking checked out repo as unsafe * feat: add logging * fix: invalid device * feat: update grafana dashboard for prod grafana * feat: add `commit_id` to header table * feat: commit latest version of dashboard * feat: move measurements into json field * feat: remove drop table migration queries * fix: `torch.arrange` -> `torch.arange` * fix: add missing `s` to `cache_position` positional argument * fix: change model * revert: `cache_positions` -> `cache_position` * fix: set device for `StaticCache` * fix: set `StaticCache` dtype * feat: limit max cache len * fix script * raise error on failure! * not try catch * try to skip generate compilation * update * update docker image! * update * update again!@ * update * updates * ??? * ?? * use `torch.cuda.synchronize()` * fix json * nits * fix * fixed! * f**k * feat: add TTNT panels * feat: add try except --------- Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>
This commit is contained in:
73
.github/workflows/benchmark.yml
vendored
73
.github/workflows/benchmark.yml
vendored
@@ -1,43 +1,72 @@
|
||||
name: Self-hosted runner (benchmark)
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "17 2 * * *"
|
||||
workflow_call:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
types: [ opened, labeled, reopened, synchronize ]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
|
||||
|
||||
jobs:
|
||||
benchmark:
|
||||
name: Benchmark
|
||||
runs-on:
|
||||
runs-on:
|
||||
group: aws-g5-4xlarge-cache
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
image: huggingface/transformers-pytorch-gpu
|
||||
options: --gpus all --privileged --ipc host
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
- name: Get repo
|
||||
if: github.event_name == 'pull_request'
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
|
||||
- name: Get repo
|
||||
if: github.event_name == 'push'
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.sha }}
|
||||
|
||||
- name: Install libpq-dev & psql
|
||||
run: |
|
||||
git fetch && git checkout ${{ github.sha }}
|
||||
apt update
|
||||
apt install -y libpq-dev postgresql-client
|
||||
|
||||
- name: Install benchmark script dependencies
|
||||
run: python3 -m pip install -r benchmark/requirements.txt
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]"
|
||||
|
||||
- name: Benchmark (daily)
|
||||
if: github.event_name == 'schedule'
|
||||
working-directory: /transformers
|
||||
- name: Run database init script
|
||||
run: |
|
||||
python3 -m pip install optimum-benchmark>=0.3.0
|
||||
HF_TOKEN=${{ secrets.TRANSFORMERS_BENCHMARK_TOKEN }} python3 benchmark/benchmark.py --repo_id hf-internal-testing/benchmark_results --path_in_repo $(date +'%Y-%m-%d') --config-dir benchmark/config --config-name generation --commit=${{ github.sha }} backend.model=google/gemma-2b backend.cache_implementation=null,static backend.torch_compile=false,true --multirun
|
||||
psql -f benchmark/init_db.sql
|
||||
env:
|
||||
PGDATABASE: metrics
|
||||
PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
|
||||
PGUSER: transformers_benchmarks
|
||||
PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }}
|
||||
|
||||
- name: Benchmark (merged to main event)
|
||||
if: github.event_name == 'push' && github.ref_name == 'main'
|
||||
working-directory: /transformers
|
||||
- name: Run benchmark
|
||||
run: |
|
||||
python3 -m pip install optimum-benchmark>=0.3.0
|
||||
HF_TOKEN=${{ secrets.TRANSFORMERS_BENCHMARK_TOKEN }} python3 benchmark/benchmark.py --repo_id hf-internal-testing/benchmark_results_merge_event --path_in_repo $(date +'%Y-%m-%d') --config-dir benchmark/config --config-name generation --commit=${{ github.sha }} backend.model=google/gemma-2b backend.cache_implementation=null,static backend.torch_compile=false,true --multirun
|
||||
git config --global --add safe.directory /__w/transformers/transformers
|
||||
if [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
|
||||
commit_id=$(echo "${{ github.event.pull_request.head.sha }}")
|
||||
elif [ "$GITHUB_EVENT_NAME" = "push" ]; then
|
||||
commit_id=$GITHUB_SHA
|
||||
fi
|
||||
commit_msg=$(git show -s --format=%s | cut -c1-70)
|
||||
python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg"
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
|
||||
PGUSER: transformers_benchmarks
|
||||
PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }}
|
||||
|
||||
Reference in New Issue
Block a user