diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst index 15a1f3771e..5026d2b7a0 100644 --- a/docs/source/serialization.rst +++ b/docs/source/serialization.rst @@ -5,7 +5,7 @@ Exporting transformers models ONNX / ONNXRuntime ============================================== -Projects ONNX (Open Neural Network eXchange) and ONNXRuntime (ORT) are part of an effort from leading industries in the AI field +Projects `ONNX (Open Neural Network eXchange) `_ and `ONNXRuntime (ORT) `_ are part of an effort from leading industries in the AI field to provide a unified and community-driven format to store and, by extension, efficiently execute neural network leveraging a variety of hardware and dedicated optimizations. @@ -34,9 +34,36 @@ The conversion tool works for both PyTorch and Tensorflow models and ensures: Also, the conversion tool supports different options which let you tune the behavior of the generated model: -* Change the target opset version of the generated model: More recent opset generally supports more operator and enables faster inference. -* Export pipeline specific prediction heads: Allow to export model along with its task-specific prediction head(s). -* Use the external data format (PyTorch only): Lets you export model which size is above 2Gb (`More info `_). +* **Change the target opset version of the generated model.** (More recent opsets generally support more operators and enable faster inference) + +* **Export pipeline-specific prediction heads.** (Allows you to export the model along with its task-specific prediction head(s)) + +* **Use the external data format (PyTorch only).** (Lets you export a model whose size is above 2GB (`More info `_)) + + +Optimizations +------------------------------------------------ + +ONNXRuntime includes some transformers-specific transformations to leverage optimized operations in the graph.
+Below are some of the operators which can be enabled to speed up inference through ONNXRuntime (*see note below*): + +* Constant folding +* Attention Layer fusing +* Skip connection LayerNormalization fusing +* FastGeLU approximation + + +Fortunately, you can let ONNXRuntime find all the possible optimized operators for you. Simply add ``--optimize`` +when exporting your model through ``convert_graph_to_onnx.py``. + +Example: + +.. code-block:: bash + + python convert_graph_to_onnx.py --framework <pt, tf> --model bert-base-cased --optimize bert-base-cased.onnx + +.. note:: + For more information about the optimizations enabled by ONNXRuntime, please have a look at the (`ONNXRuntime Github <https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers>`_) Quantization ------------------------------------------------ @@ -85,6 +112,8 @@ Example of quantized BERT model export: above command will contain the original ONNX model storing `float32` weights. The second one, with ``-quantized`` suffix, will hold the quantized parameters. +.. note:: + The quantization export gives the best performance when used in combination with ``--optimize``.
TorchScript ======================================= diff --git a/src/transformers/convert_graph_to_onnx.py b/src/transformers/convert_graph_to_onnx.py index 72082ab0b5..c79fe644ba 100644 --- a/src/transformers/convert_graph_to_onnx.py +++ b/src/transformers/convert_graph_to_onnx.py @@ -3,7 +3,7 @@ from os import listdir, makedirs from pathlib import Path from typing import Dict, List, Optional, Tuple -from packaging.version import parse +from packaging.version import Version, parse from transformers import is_tf_available, is_torch_available from transformers.file_utils import ModelOutput @@ -72,7 +72,7 @@ def generate_identified_filename(filename: Path, identifier: str) -> Path: return filename.parent.joinpath(filename.stem + identifier).with_suffix(filename.suffix) -def ensure_onnxruntime_installed(): +def check_onnxruntime_requirements(minimum_version: Version): """ Check onnxruntime is installed and if the installed version match is recent enough. Raises: @@ -88,7 +88,7 @@ def ensure_onnxruntime_installed(): if ort_version < ORT_QUANTIZE_MINIMUM_VERSION: raise ImportError( f"We found an older version of onnxruntime ({onnxruntime.__version__}) " - f"but we require onnxruntime to be >= 1.4.0 to enable all the conversions options.\n" + f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n" f"Please update onnxruntime by running `pip install --upgrade onnxruntime`" ) @@ -330,6 +330,30 @@ def convert( convert_tensorflow(nlp, opset, output) +def optimize(onnx_model_path: Path) -> Path: + """ + Load the model at the specified path and let onnxruntime look at transformations on the graph + to enable all the optimizations possible + Args: + onnx_model_path: filepath where the model binary description is stored + + Returns: Path where the optimized model binary description has been saved + + """ + from onnxruntime import SessionOptions, InferenceSession + + # Generate model name with suffix "optimized" + opt_model_path = 
generate_identified_filename(onnx_model_path, "-optimized") + sess_option = SessionOptions() + sess_option.optimized_model_filepath = opt_model_path.as_posix() + _ = InferenceSession(onnx_model_path.as_posix(), sess_option) + + print(f"Optimized model has been written at {opt_model_path}: \N{heavy check mark}") + print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\") + + return opt_model_path + + def quantize(onnx_model_path: Path) -> Path: """ Quantize the weights of the model from float32 to in8 to allow very efficient inference on modern CPU. @@ -338,17 +362,18 @@ def quantize(onnx_model_path: Path) -> Path: Returns: The Path generated for the quantized """ - try: - ensure_onnxruntime_installed() import onnx - from onnxruntime import __version__ as ort_version from onnxruntime.quantization import quantize, QuantizationMode - print(f"Found ONNX: {onnx.__version__}") - print(f"Found ONNXRuntime: {ort_version}") - onnx_model = onnx.load(onnx_model_path.as_posix()) + + # Discussed with @yufenglee from ONNX runtime, this will be address in the next release of onnxruntime + print( + "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n" + "This limitation will be removed in the next release of onnxruntime." 
+ ) + quantized_model = quantize( model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True, ) @@ -357,11 +382,11 @@ def quantize(onnx_model_path: Path) -> Path: quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized") # Save model - print(f"Storing quantized model at {quantized_model_path}") - onnx.save(quantized_model, quantized_model_path.as_posix()) + print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}") + onnx.save_model(quantized_model, quantized_model_path.as_posix()) return quantized_model_path - except ImportError as ie: + except Exception as ie: print(f"Error while quantizing the model:\n{str(ie)}") @@ -369,7 +394,7 @@ def verify(path: Path): from onnxruntime import InferenceSession, SessionOptions from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException - print(f"Checking ONNX model loading from: {path}") + print(f"Checking ONNX model loading from: {path} ...") try: onnx_options = SessionOptions() _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"]) @@ -386,6 +411,7 @@ if __name__ == "__main__": args.output = Path(args.output).absolute() try: + print("\n====== Converting model to ONNX ======") # Convert convert( args.framework, @@ -398,12 +424,34 @@ if __name__ == "__main__": ) if args.quantize: - args.quantized_output = quantize(args.output) + # Ensure requirements for quantization on onnxruntime is met + check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION) + + # onnxruntime optimizations doesn't provide the same level of performances on TensorFlow than PyTorch + if args.framework == "tf": + print( + "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n" + "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n" + "\t For more information, please refer to the onnxruntime documentation:\n" + 
"\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n" + ) + + print("\n====== Optimizing ONNX model ======") + + # Quantization works best when using the optimized version of the model + args.optimized_output = optimize(args.output) + + # Do the quantization on the right graph + args.quantized_output = quantize(args.optimized_output) # And verify if args.check_loading: + print("\n====== Check exported ONNX model(s) ======") verify(args.output) + if hasattr(args, "optimized_output"): + verify(args.optimized_output) + if hasattr(args, "quantized_output"): verify(args.quantized_output)