Qdqbert example add benchmark script with ORT-TRT (#16592)
* add ort-trt benchmark script * Update README.md * ort version can be newer * formatting * specify ORT version
This commit is contained in:
@@ -23,6 +23,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
|
||||
RUN python3 -m pip install --no-cache-dir --ignore-installed pycuda
|
||||
RUN python3 -m pip install --no-cache-dir \
|
||||
pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
|
||||
RUN python3 -m pip install --no-cache-dir onnxruntime-gpu==1.11
|
||||
|
||||
WORKDIR /workspace
|
||||
COPY . transformers/
|
||||
|
||||
@@ -101,6 +101,12 @@ Recalibrating will affect the accuracy of the model, but the change should be mi
|
||||
trtexec --onnx=model.onnx --explicitBatch --workspace=16384 --int8 --shapes=input_ids:64x128,attention_mask:64x128,token_type_ids:64x128 --verbose
|
||||
```
|
||||
|
||||
### Benchmark the INT8 QAT ONNX model inference with [ONNX Runtime-TRT](https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html) using dummy input
|
||||
|
||||
```
|
||||
python3 ort-infer-benchmark.py
|
||||
```
|
||||
|
||||
### Evaluate the INT8 QAT ONNX model inference with TensorRT
|
||||
|
||||
```
|
||||
|
||||
@@ -0,0 +1,51 @@
|
||||
import os
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
|
||||
import onnxruntime as ort
|
||||
|
||||
|
||||
# Benchmark script: times repeated inference of "model.onnx" through ONNX Runtime
# with the TensorRT execution provider, using constant dummy inputs.

# TensorRT execution-provider settings, set before the session is created.
os.environ["ORT_TENSORRT_INT8_ENABLE"] = "1"
os.environ["ORT_TENSORRT_INT8_USE_NATIVE_CALIBRATION_TABLE"] = "0"
os.environ["ORT_TENSORRT_ENGINE_CACHE_ENABLE"] = "1"

sess_opt = ort.SessionOptions()
# NOTE(review): ORT graph optimizations are turned off entirely — presumably so the
# Q/DQ nodes reach TensorRT unmodified; confirm against the ORT-TRT documentation.
sess_opt.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL

print("Create inference session...")
# Providers are listed in priority order: TensorRT first, CUDA second.
execution_provider = ["TensorrtExecutionProvider", "CUDAExecutionProvider"]
sess = ort.InferenceSession("model.onnx", sess_options=sess_opt, providers=execution_provider)
run_opt = ort.RunOptions()

sequence = 128
batch = 1
input_ids = np.ones((batch, sequence), dtype=np.int64)
attention_mask = np.ones((batch, sequence), dtype=np.int64)
token_type_ids = np.ones((batch, sequence), dtype=np.int64)

# Resolve the input names once and reuse the same feed for every run, so the
# timed loop measures only sess.run() and not repeated metadata lookups.
# The arrays are bound to the model's first three inputs by position.
model_inputs = sess.get_inputs()
input_feed = {
    model_inputs[0].name: input_ids,
    model_inputs[1].name: attention_mask,
    model_inputs[2].name: token_type_ids,
}

print("Warm up phase...")
# One untimed run so that one-time setup cost is kept out of the measurement.
sess.run(None, input_feed, run_options=run_opt)

print("Start inference...")
max_iters = 2000
# perf_counter() is monotonic and high-resolution; time.time() can jump if the
# system clock is adjusted, which would corrupt the measured interval.
start_time = time.perf_counter()
for _ in range(max_iters):
    sess.run(None, input_feed, run_options=run_opt)
elapsed_seconds = time.perf_counter() - start_time

print("Average Inference Time = {:.3f} ms".format(elapsed_seconds * 1000 / max_iters))
|
||||
Reference in New Issue
Block a user