From 4bfe75bd08410ac615687658da69ca688228f125 Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Wed, 2 Mar 2022 04:03:14 -0500 Subject: [PATCH] M2M100 support for ONNX export (#15193) * Add M2M100 support for ONNX export * Delete useless imports * Add M2M100 to tests * Fix protobuf issue --- docs/source/serialization.mdx | 29 ++-- src/transformers/models/m2m_100/__init__.py | 4 +- .../models/m2m_100/configuration_m2m_100.py | 129 ++++++++++++++++++ src/transformers/onnx/convert.py | 39 ++++-- src/transformers/onnx/features.py | 4 + tests/onnx/test_onnx_v2.py | 1 + 6 files changed, 177 insertions(+), 29 deletions(-) diff --git a/docs/source/serialization.mdx b/docs/source/serialization.mdx index 5eff7580d9..de1675ee44 100644 --- a/docs/source/serialization.mdx +++ b/docs/source/serialization.mdx @@ -55,6 +55,7 @@ Ready-made configurations include the following architectures: - GPT Neo - I-BERT - LayoutLM +- M2M100 - Marian - mBART - OpenAI GPT-2 @@ -584,12 +585,12 @@ traced_model(tokens_tensor, segments_tensors) ### Deploying HuggingFace TorchScript models on AWS using the Neuron SDK -AWS introduced the [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) -instance family for low cost, high performance machine learning inference in the cloud. -The Inf1 instances are powered by the AWS Inferentia chip, a custom-built hardware accelerator, -specializing in deep learning inferencing workloads. -[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) -is the SDK for Inferentia that supports tracing and optimizing transformers models for +AWS introduced the [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) +instance family for low cost, high performance machine learning inference in the cloud. +The Inf1 instances are powered by the AWS Inferentia chip, a custom-built hardware accelerator, +specializing in deep learning inferencing workloads. +[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) +is the SDK for Inferentia that supports tracing and optimizing transformers models for deployment on Inf1. The Neuron SDK provides: @@ -600,13 +601,13 @@ deployment on Inf1. The Neuron SDK provides: #### Implications -Transformers Models based on the [BERT (Bidirectional Encoder Representations from Transformers)](https://huggingface.co/docs/transformers/master/model_doc/bert) +Transformers Models based on the [BERT (Bidirectional Encoder Representations from Transformers)](https://huggingface.co/docs/transformers/master/model_doc/bert) architecture, or its variants such as [distilBERT](https://huggingface.co/docs/transformers/master/model_doc/distilbert) - and [roBERTa](https://huggingface.co/docs/transformers/master/model_doc/roberta) - will run best on Inf1 for non-generative tasks such as Extractive Question Answering, + and [roBERTa](https://huggingface.co/docs/transformers/master/model_doc/roberta) + will run best on Inf1 for non-generative tasks such as Extractive Question Answering, Sequence Classification, Token Classification. Alternatively, text generation -tasks can be adapted to run on Inf1, according to this [AWS Neuron MarianMT tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). -More information about models that can be converted out of the box on Inferentia can be +tasks can be adapted to run on Inf1, according to this [AWS Neuron MarianMT tutorial](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). +More information about models that can be converted out of the box on Inferentia can be found in the [Model Architecture Fit section of the Neuron documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia). #### Dependencies @@ -618,8 +619,8 @@ Using AWS Neuron to convert models requires the following dependencies and envir #### Converting a Model for AWS Neuron -Using the same script as in [Using TorchScript in Python](https://huggingface.co/docs/transformers/master/en/serialization#using-torchscript-in-python) -to trace a "BertModel", you import `torch.neuron` framework extension to access +Using the same script as in [Using TorchScript in Python](https://huggingface.co/docs/transformers/master/en/serialization#using-torchscript-in-python) +to trace a "BertModel", you import `torch.neuron` framework extension to access the components of the Neuron SDK through a Python API. ```python @@ -643,5 +644,5 @@ torch.neuron.trace(model, [token_tensor, segments_tensors]) This change enables Neuron SDK to trace the model and optimize it to run in Inf1 instances. -To learn more about AWS Neuron SDK features, tools, example tutorials and latest updates, +To learn more about AWS Neuron SDK features, tools, example tutorials and latest updates, please see the [AWS NeuronSDK documentation](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). diff --git a/src/transformers/models/m2m_100/__init__.py b/src/transformers/models/m2m_100/__init__.py index 6e79e9791b..0f24fd3981 100644 --- a/src/transformers/models/m2m_100/__init__.py +++ b/src/transformers/models/m2m_100/__init__.py @@ -21,7 +21,7 @@ from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_availab _import_structure = { - "configuration_m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config"], + "configuration_m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config", "M2M100OnnxConfig"], "tokenization_m2m_100": ["M2M100Tokenizer"], } @@ -36,7 +36,7 @@ if is_torch_available(): if TYPE_CHECKING: - from .configuration_m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config + from .configuration_m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config, M2M100OnnxConfig from .tokenization_m2m_100 import M2M100Tokenizer if is_torch_available(): diff --git a/src/transformers/models/m2m_100/configuration_m2m_100.py b/src/transformers/models/m2m_100/configuration_m2m_100.py index a383be9f25..62a63d248b 100644 --- a/src/transformers/models/m2m_100/configuration_m2m_100.py +++ b/src/transformers/models/m2m_100/configuration_m2m_100.py @@ -13,8 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. """ M2M100 model configuration""" +from collections import OrderedDict +from typing import Any, Mapping, Optional +from ... import PreTrainedTokenizer from ...configuration_utils import PretrainedConfig +from ...file_utils import TensorType, is_torch_available +from ...onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast +from ...onnx.utils import compute_effective_axis_dimension from ...utils import logging @@ -153,3 +159,126 @@ class M2M100Config(PretrainedConfig): decoder_start_token_id=decoder_start_token_id, **kwargs, ) + + +class M2M100OnnxConfig(OnnxSeq2SeqConfigWithPast): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ] + ) + + if self.use_past: + common_inputs["decoder_input_ids"] = {0: "batch"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"} + else: + common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"} + + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction="inputs") + return common_inputs + + # Copied from BartOnnxConfig._generate_dummy_inputs_for_sequence_classification_and_question_answering + # A better name would be _generate_dummy_inputs_for_encoder_and_decoder because sequence classification and question + # answering are not supported for M2M100, but this name is preserved to be able to check that the copy matches what + # was done for BART so that it can be updated if need be. + def _generate_dummy_inputs_for_sequence_classification_and_question_answering( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + # Copied from OnnxConfig.generate_dummy_inputs + # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity. + # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX + batch_size = compute_effective_axis_dimension( + batch_size, fixed_dimension=OnnxConfig.DEFAULT_FIXED_BATCH, num_token_to_add=0 + ) + + # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX + token_to_add = tokenizer.num_special_tokens_to_add(is_pair) + seq_length = compute_effective_axis_dimension( + seq_length, fixed_dimension=OnnxConfig.DEFAULT_FIXED_SEQUENCE, num_token_to_add=token_to_add + ) + + # Generate dummy inputs according to compute batch and sequence + dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size + common_inputs = dict(tokenizer(dummy_input, return_tensors=framework)) + return common_inputs + + # Copied from transformers.models.bart.configuration_bart.BartOnnxConfig._generate_dummy_inputs_for_default_and_seq2seq_lm + def _generate_dummy_inputs_for_default_and_seq2seq_lm( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, seq_length, is_pair, framework + ) + + # Generate decoder inputs + decoder_seq_length = seq_length if not self.use_past else 1 + decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, decoder_seq_length, is_pair, framework + ) + decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()} + common_inputs = dict(**encoder_inputs, **decoder_inputs) + + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + batch, encoder_seq_length = common_inputs["input_ids"].shape + decoder_seq_length = common_inputs["decoder_input_ids"].shape[1] + num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads + encoder_shape = ( + batch, + num_encoder_attention_heads, + encoder_seq_length, + self._config.hidden_size // num_encoder_attention_heads, + ) + decoder_past_length = decoder_seq_length + 3 + decoder_shape = ( + batch, + num_decoder_attention_heads, + decoder_past_length, + self._config.hidden_size // num_decoder_attention_heads, + ) + + common_inputs["decoder_attention_mask"] = torch.cat( + [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1 + ) + + common_inputs["past_key_values"] = [] + # If the number of encoder and decoder layers are present in the model configuration, both are considered + num_encoder_layers, num_decoder_layers = self.num_layers + min_num_layers = min(num_encoder_layers, num_decoder_layers) + max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers + remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder" + + for _ in range(min_num_layers): + common_inputs["past_key_values"].append( + ( + torch.zeros(decoder_shape), + torch.zeros(decoder_shape), + torch.zeros(encoder_shape), + torch.zeros(encoder_shape), + ) + ) + # TODO: test this. + shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape + for _ in range(min_num_layers, max_num_layers): + common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape))) + return common_inputs + + generate_dummy_inputs = _generate_dummy_inputs_for_default_and_seq2seq_lm diff --git a/src/transformers/onnx/convert.py b/src/transformers/onnx/convert.py index f66c0b61dd..dfca8d3663 100644 --- a/src/transformers/onnx/convert.py +++ b/src/transformers/onnx/convert.py @@ -117,21 +117,34 @@ def export_pytorch( # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11, # so we check the torch version for backwards compatibility - if parse(torch.__version__) <= parse("1.10.99"): + if parse(torch.__version__) < parse("1.10"): # export can work with named args but the dict containing named args # has to be the last element of the args tuple. - onnx_export( - model, - (model_inputs,), - f=output.as_posix(), - input_names=list(config.inputs.keys()), - output_names=onnx_outputs, - dynamic_axes={name: axes for name, axes in chain(config.inputs.items(), config.outputs.items())}, - do_constant_folding=True, - use_external_data_format=config.use_external_data_format(model.num_parameters()), - enable_onnx_checker=True, - opset_version=opset, - ) + try: + onnx_export( + model, + (model_inputs,), + f=output.as_posix(), + input_names=list(config.inputs.keys()), + output_names=onnx_outputs, + dynamic_axes={ + name: axes for name, axes in chain(config.inputs.items(), config.outputs.items()) + }, + do_constant_folding=True, + use_external_data_format=config.use_external_data_format(model.num_parameters()), + enable_onnx_checker=True, + opset_version=opset, + ) + except RuntimeError as err: + message = str(err) + if ( + message + == "Exporting model exceed maximum protobuf size of 2GB. Please call torch.onnx.export without setting use_external_data_format parameter." + ): + message = "Exporting model exceed maximum protobuf size of 2GB. Please call torch.onnx.export without setting use_external_data_format parameter or try with torch 1.10+." + raise RuntimeError(message) + else: + raise err else: onnx_export( model, diff --git a/src/transformers/onnx/features.py b/src/transformers/onnx/features.py index 58db3ed3f4..fbf1703578 100644 --- a/src/transformers/onnx/features.py +++ b/src/transformers/onnx/features.py @@ -12,6 +12,7 @@ from ..models.gpt2 import GPT2OnnxConfig from ..models.gpt_neo import GPTNeoOnnxConfig from ..models.ibert import IBertOnnxConfig from ..models.layoutlm import LayoutLMOnnxConfig +from ..models.m2m_100 import M2M100OnnxConfig from ..models.marian import MarianOnnxConfig from ..models.mbart import MBartOnnxConfig from ..models.roberta import RobertaOnnxConfig @@ -184,6 +185,9 @@ class FeaturesManager: "causal-lm-with-past", onnx_config_cls=MarianOnnxConfig, ), + "m2m-100": supported_features_mapping( + "default", "default-with-past", "seq2seq-lm", "seq2seq-lm-with-past", onnx_config_cls=M2M100OnnxConfig + ), "roberta": supported_features_mapping( "default", "masked-lm", diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index 0cd53f885a..00b5d3b6c3 100644 --- a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -190,6 +190,7 @@ PYTORCH_EXPORT_SEQ2SEQ_WITH_PAST_MODELS = { ("mbart", "sshleifer/tiny-mbart"), ("t5", "t5-small"), ("marian", "Helsinki-NLP/opus-mt-en-de"), + ("m2m-100", "facebook/m2m100_418M"), } TENSORFLOW_EXPORT_DEFAULT_MODELS = {