feat: run benchmarks on A100 (#34287)

2024-10-28 19:33:17 +01:00
parent d21dbd1520
commit 6cc4a67b3d
3 changed files with 943 additions and 779 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -16,8 +16,11 @@ env:
 jobs:
  benchmark:
    name: Benchmark
+    strategy:
+      matrix:
+        group: [aws-g5-4xlarge-cache, aws-p4d-24xlarge-plus]
    runs-on:
-      group: aws-g5-4xlarge-cache
+      group: ${{ matrix.group }}
    if: |
      (github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark') )||
      (github.event_name == 'push' && github.ref == 'refs/heads/main')
@@ -60,9 +63,13 @@ jobs:
            commit_id=$GITHUB_SHA
          fi
          commit_msg=$(git show -s --format=%s | cut -c1-70)
          df -h
          python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg"
        env:
          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
          # Enable this to see debug logs
          # HF_HUB_VERBOSITY: debug
          # TRANSFORMERS_VERBOSITY: debug
          PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
          PGUSER: transformers_benchmarks
          PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }}
--- a/benchmark/grafana_dashboard.json
+++ b/benchmark/grafana_dashboard.json
--- a/benchmark/llama.py
+++ b/benchmark/llama.py
@@ -96,17 +96,21 @@ def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_ge
        )
        conn.commit()
        benchmark_id = cur.fetchone()[0]
        logger.info(f"running benchmark #{benchmark_id} on {gpu_name}")
        metrics_thread = Thread(target=collect_metrics, args=[benchmark_id, continue_metric_collection])
        metrics_thread.start()
        logger.info("started background thread to fetch device metrics")
        os.environ["TOKENIZERS_PARALLELISM"] = "false"  # silence warnings when compiling
        device = "cuda"
        ckpt = "meta-llama/Llama-2-7b-hf"
        logger.info("downloading weights")
        # This is to avoid counting download in model load time measurement
        model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16)
        gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1)
        logger.info("loading model")
        start = perf_counter()
        model = AutoModelForCausalLM.from_pretrained(
            ckpt, torch_dtype=torch.float16, generation_config=gen_config