From 14daa6102a0e8a35ef734dd21bfcf31d9b0207d1 Mon Sep 17 00:00:00 2001
From: Shang Zhang <69697986+shangz-ai@users.noreply.github.com>
Date: Tue, 12 Apr 2022 08:13:59 -0700
Subject: [PATCH] Qdqbert example add benchmark script with ORT-TRT (#16592)

* add ort-trt benchmark script

* Update README.md

* ort version can be newer

* formatting

* specify ORT version
---
 .../quantization-qdqbert/Dockerfile           |  1 +
 .../quantization-qdqbert/README.md            |  6 +++
 .../ort-infer-benchmark.py                    | 51 +++++++++++++++++++
 3 files changed, 58 insertions(+)
 create mode 100644 examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py

diff --git a/examples/research_projects/quantization-qdqbert/Dockerfile b/examples/research_projects/quantization-qdqbert/Dockerfile
index 2a6604d6e6..e64c9f0e02 100644
--- a/examples/research_projects/quantization-qdqbert/Dockerfile
+++ b/examples/research_projects/quantization-qdqbert/Dockerfile
@@ -23,6 +23,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
 RUN python3 -m pip install --no-cache-dir --ignore-installed pycuda
 RUN python3 -m pip install --no-cache-dir \
     pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
+RUN python3 -m pip install --no-cache-dir onnxruntime-gpu==1.11
 
 WORKDIR /workspace
 COPY . transformers/
diff --git a/examples/research_projects/quantization-qdqbert/README.md b/examples/research_projects/quantization-qdqbert/README.md
index 9c8ec36b04..fe69819cc5 100644
--- a/examples/research_projects/quantization-qdqbert/README.md
+++ b/examples/research_projects/quantization-qdqbert/README.md
@@ -101,6 +101,12 @@ Recalibrating will affect the accuracy of the model, but the change should be mi
 trtexec --onnx=model.onnx --explicitBatch --workspace=16384 --int8 --shapes=input_ids:64x128,attention_mask:64x128,token_type_ids:64x128 --verbose
 ```
 
+### Benchmark the INT8 QAT ONNX model inference with [ONNX Runtime-TRT](https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html) using dummy input
+
+```
+python3 ort-infer-benchmark.py
+```
+
 ### Evaluate the INT8 QAT ONNX model inference with TensorRT
 
 ```
diff --git a/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py b/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py
new file mode 100644
index 0000000000..4ed4203062
--- /dev/null
+++ b/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py
@@ -0,0 +1,51 @@
+import os
+import time
+
+import numpy as np
+
+import onnxruntime as ort
+
+
+# Benchmark the INT8 QAT ONNX model with ONNX Runtime's TensorRT execution
+# provider, timing repeated runs of "model.onnx" on dummy all-ones inputs.
+
+# ORT_TENSORRT_* variables configure the TensorRT provider; set before session creation.
+os.environ["ORT_TENSORRT_INT8_ENABLE"] = "1"
+os.environ["ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE"] = "0"
+os.environ["ORT_TENSORRT_ENGINE_CACHE_ENABLE"] = "1"
+
+sess_opt = ort.SessionOptions()
+sess_opt.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
+print("Create inference session...")
+# Prefer TensorRT; CUDA is listed as the fallback provider.
+execution_provider = ["TensorrtExecutionProvider", "CUDAExecutionProvider"]
+sess = ort.InferenceSession("model.onnx", sess_options=sess_opt, providers=execution_provider)
+run_opt = ort.RunOptions()
+
+sequence = 128
+batch = 1
+input_ids = np.ones((batch, sequence), dtype=np.int64)
+attention_mask = np.ones((batch, sequence), dtype=np.int64)
+token_type_ids = np.ones((batch, sequence), dtype=np.int64)
+
+# Build the feed once: the input names never change between runs, so resolving
+# them inside the timed loop would only add Python overhead to the measurement.
+model_inputs = sess.get_inputs()
+feed = {
+    model_inputs[0].name: input_ids,
+    model_inputs[1].name: attention_mask,
+    model_inputs[2].name: token_type_ids,
+}
+
+print("Warm up phase...")
+# Untimed first run, kept outside the measured loop below.
+sess.run(None, feed, run_options=run_opt)
+
+print("Start inference...")
+max_iters = 2000
+# perf_counter() is monotonic and high resolution; time.time() can jump with clock changes.
+start_time = time.perf_counter()
+for _ in range(max_iters):
+    sess.run(None, feed, run_options=run_opt)
+elapsed = time.perf_counter() - start_time
+print("Average Inference Time = {:.3f} ms".format(elapsed * 1000 / max_iters))