From 14daa6102a0e8a35ef734dd21bfcf31d9b0207d1 Mon Sep 17 00:00:00 2001
From: Shang Zhang <69697986+shangz-ai@users.noreply.github.com>
Date: Tue, 12 Apr 2022 08:13:59 -0700
Subject: [PATCH] Qdqbert example add benchmark script with ORT-TRT (#16592)

* add ort-trt benchmark script

* Update README.md

* ort version can be newer

* formatting

* specify ORT version
---
 .../quantization-qdqbert/Dockerfile           |  1 +
 .../quantization-qdqbert/README.md            |  6 +++
 .../ort-infer-benchmark.py                    | 51 +++++++++++++++++++
 3 files changed, 58 insertions(+)
 create mode 100644 examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py

diff --git a/examples/research_projects/quantization-qdqbert/Dockerfile b/examples/research_projects/quantization-qdqbert/Dockerfile
index 2a6604d6e6..e64c9f0e02 100644
--- a/examples/research_projects/quantization-qdqbert/Dockerfile
+++ b/examples/research_projects/quantization-qdqbert/Dockerfile
@@ -23,6 +23,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
 RUN python3 -m pip install --no-cache-dir --ignore-installed pycuda
 RUN python3 -m pip install --no-cache-dir \
     pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
+RUN python3 -m pip install --no-cache-dir onnxruntime-gpu==1.11
 
 WORKDIR /workspace
 COPY . transformers/
diff --git a/examples/research_projects/quantization-qdqbert/README.md b/examples/research_projects/quantization-qdqbert/README.md
index 9c8ec36b04..fe69819cc5 100644
--- a/examples/research_projects/quantization-qdqbert/README.md
+++ b/examples/research_projects/quantization-qdqbert/README.md
@@ -101,6 +101,12 @@ Recalibrating will affect the accuracy of the model, but the change should be mi
 trtexec --onnx=model.onnx --explicitBatch --workspace=16384 --int8 --shapes=input_ids:64x128,attention_mask:64x128,token_type_ids:64x128 --verbose
 ```
 
+### Benchmark the INT8 QAT ONNX model inference with [ONNX Runtime-TRT](https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html) using dummy input
+
+```
+python3 ort-infer-benchmark.py
+```
+
 ### Evaluate the INT8 QAT ONNX model inference with TensorRT
 
 ```
diff --git a/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py b/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py
new file mode 100644
index 0000000000..4ed4203062
--- /dev/null
+++ b/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py
@@ -0,0 +1,51 @@
+"""Benchmark INT8 QAT ONNX model inference with ONNX Runtime's TensorRT provider.
+
+Loads ``model.onnx`` from the current directory, feeds it all-ones dummy
+inputs, and prints the average latency of one inference call.
+"""
+import os
+import time
+
+import numpy as np
+
+import onnxruntime as ort
+
+
+# TensorRT execution provider settings; exported before the session is created.
+os.environ["ORT_TENSORRT_INT8_ENABLE"] = "1"
+os.environ["ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE"] = "0"
+os.environ["ORT_TENSORRT_ENGINE_CACHE_ENABLE"] = "1"
+
+sess_opt = ort.SessionOptions()
+sess_opt.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
+print("Create inference session...")
+execution_provider = ["TensorrtExecutionProvider", "CUDAExecutionProvider"]
+sess = ort.InferenceSession("model.onnx", sess_options=sess_opt, providers=execution_provider)
+run_opt = ort.RunOptions()
+
+sequence = 128
+batch = 1
+input_ids = np.ones((batch, sequence), dtype=np.int64)
+attention_mask = np.ones((batch, sequence), dtype=np.int64)
+token_type_ids = np.ones((batch, sequence), dtype=np.int64)
+
+# Build the feed once so the timed loop measures only sess.run(). Inputs are bound
+# by position: assumes the model declares ids, attention mask, token types in order.
+input_names = [model_input.name for model_input in sess.get_inputs()]
+feed = {
+    input_names[0]: input_ids,
+    input_names[1]: attention_mask,
+    input_names[2]: token_type_ids,
+}
+
+print("Warm up phase...")
+sess.run(None, feed, run_options=run_opt)
+
+print("Start inference...")
+max_iters = 2000
+# perf_counter() is monotonic; time.time() can jump if the system clock is adjusted.
+start_time = time.perf_counter()
+for _ in range(max_iters):
+    predict = sess.run(None, feed, run_options=run_opt)
+elapsed = time.perf_counter() - start_time
+print("Average Inference Time = {:.3f} ms".format(elapsed * 1000 / max_iters))