Added onnx config whisper (#19525)
* Added onnx config whisper * added whisper support onnx * add audio input data * added whisper support onnx * fixed the seqlength value * Updated the whisper onnx ocnfig * restore files to old version * removed attention mask from inputs * Updated get_dummy_input_onnxruntime docstring * Updated relative imports and token generation * update docstring
This commit is contained in:
@@ -100,6 +100,7 @@ Ready-made configurations include the following architectures:
|
|||||||
- Table Transformer
|
- Table Transformer
|
||||||
- Vision Encoder decoder
|
- Vision Encoder decoder
|
||||||
- ViT
|
- ViT
|
||||||
|
- Whisper
|
||||||
- XLM
|
- XLM
|
||||||
- XLM-RoBERTa
|
- XLM-RoBERTa
|
||||||
- XLM-RoBERTa-XL
|
- XLM-RoBERTa-XL
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_availabl
|
|||||||
|
|
||||||
|
|
||||||
_import_structure = {
|
_import_structure = {
|
||||||
"configuration_whisper": ["WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP", "WhisperConfig"],
|
"configuration_whisper": ["WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP", "WhisperConfig", "WhisperOnnxConfig"],
|
||||||
"feature_extraction_whisper": ["WhisperFeatureExtractor"],
|
"feature_extraction_whisper": ["WhisperFeatureExtractor"],
|
||||||
"processing_whisper": ["WhisperProcessor"],
|
"processing_whisper": ["WhisperProcessor"],
|
||||||
"tokenization_whisper": ["WhisperTokenizer"],
|
"tokenization_whisper": ["WhisperTokenizer"],
|
||||||
@@ -55,7 +55,7 @@ else:
|
|||||||
]
|
]
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from .configuration_whisper import WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP, WhisperConfig
|
from .configuration_whisper import WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP, WhisperConfig, WhisperOnnxConfig
|
||||||
from .feature_extraction_whisper import WhisperFeatureExtractor
|
from .feature_extraction_whisper import WhisperFeatureExtractor
|
||||||
from .processing_whisper import WhisperProcessor
|
from .processing_whisper import WhisperProcessor
|
||||||
from .tokenization_whisper import WhisperTokenizer
|
from .tokenization_whisper import WhisperTokenizer
|
||||||
|
|||||||
@@ -14,10 +14,19 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
""" Whisper model configuration"""
|
""" Whisper model configuration"""
|
||||||
|
|
||||||
|
from collections import OrderedDict
|
||||||
|
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
|
||||||
|
|
||||||
from ...configuration_utils import PretrainedConfig
|
from ...configuration_utils import PretrainedConfig
|
||||||
|
from ...onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
|
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from ...feature_extraction_utils import FeatureExtractionMixin
|
||||||
|
from ...tokenization_utils_base import PreTrainedTokenizerBase
|
||||||
|
from ...utils import TensorType
|
||||||
|
|
||||||
logger = logging.get_logger(__name__)
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||||
@@ -214,3 +223,59 @@ class WhisperConfig(PretrainedConfig):
|
|||||||
begin_suppress_tokens=begin_suppress_tokens,
|
begin_suppress_tokens=begin_suppress_tokens,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class WhisperOnnxConfig(OnnxSeq2SeqConfigWithPast):
|
||||||
|
@property
|
||||||
|
def inputs(self) -> Mapping[str, Mapping[int, str]]:
|
||||||
|
common_inputs = OrderedDict(
|
||||||
|
[
|
||||||
|
("input_features", {0: "batch", 1: "feature_size", 2: "encoder_sequence"}),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
if self.use_past:
|
||||||
|
common_inputs["decoder_input_ids"] = {0: "batch"}
|
||||||
|
else:
|
||||||
|
common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
|
||||||
|
|
||||||
|
if self.use_past:
|
||||||
|
self.fill_with_past_key_values_(common_inputs, direction="inputs")
|
||||||
|
|
||||||
|
return common_inputs
|
||||||
|
|
||||||
|
def generate_dummy_inputs(
|
||||||
|
self,
|
||||||
|
preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
|
||||||
|
batch_size: int = -1,
|
||||||
|
seq_length: int = -1,
|
||||||
|
is_pair: bool = False,
|
||||||
|
framework: Optional["TensorType"] = None,
|
||||||
|
sampling_rate: int = 22050,
|
||||||
|
time_duration: float = 5.0,
|
||||||
|
frequency: int = 220,
|
||||||
|
) -> Mapping[str, Any]:
|
||||||
|
dummy_inputs = OrderedDict()
|
||||||
|
encoder_inputs = OnnxConfig.generate_dummy_inputs(
|
||||||
|
self,
|
||||||
|
preprocessor=preprocessor.feature_extractor,
|
||||||
|
batch_size=batch_size,
|
||||||
|
framework=framework,
|
||||||
|
sampling_rate=sampling_rate,
|
||||||
|
time_duration=time_duration,
|
||||||
|
frequency=frequency,
|
||||||
|
)
|
||||||
|
decoder_inputs = super().generate_dummy_inputs(
|
||||||
|
preprocessor.tokenizer, batch_size, seq_length, is_pair, framework
|
||||||
|
)
|
||||||
|
|
||||||
|
dummy_inputs["input_features"] = encoder_inputs.pop("input_features")
|
||||||
|
dummy_inputs["decoder_input_ids"] = decoder_inputs.pop("decoder_input_ids")
|
||||||
|
|
||||||
|
if "past_key_values" in decoder_inputs:
|
||||||
|
dummy_inputs["past_key_values"] = decoder_inputs.pop("past_key_values")
|
||||||
|
|
||||||
|
return dummy_inputs
|
||||||
|
|
||||||
|
@property
|
||||||
|
def atol_for_validation(self) -> float:
|
||||||
|
return 1e-3
|
||||||
|
|||||||
@@ -104,6 +104,7 @@ class OnnxConfig(ABC):
|
|||||||
"sequence-classification": OrderedDict({"logits": {0: "batch"}}),
|
"sequence-classification": OrderedDict({"logits": {0: "batch"}}),
|
||||||
"token-classification": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
|
"token-classification": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
|
||||||
"vision2seq-lm": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
|
"vision2seq-lm": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
|
||||||
|
"speech2seq-lm": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, config: "PretrainedConfig", task: str = "default", patching_specs: List[PatchingSpec] = None):
|
def __init__(self, config: "PretrainedConfig", task: str = "default", patching_specs: List[PatchingSpec] = None):
|
||||||
@@ -262,6 +263,19 @@ class OnnxConfig(ABC):
|
|||||||
images.append(Image.fromarray(data.astype("uint8")).convert("RGB"))
|
images.append(Image.fromarray(data.astype("uint8")).convert("RGB"))
|
||||||
return images
|
return images
|
||||||
|
|
||||||
|
def _generate_dummy_audio(
|
||||||
|
self, batch_size: int = 2, sampling_rate: int = 22050, time_duration: float = 5.0, frequency: int = 220
|
||||||
|
):
|
||||||
|
audio_data = []
|
||||||
|
for _ in range(batch_size):
|
||||||
|
# time variable
|
||||||
|
t = np.linspace(0, time_duration, int(time_duration * sampling_rate), endpoint=False)
|
||||||
|
|
||||||
|
# generate pure sine wave at `frequency` Hz
|
||||||
|
audio_data.append(0.5 * np.sin(2 * np.pi * frequency * t))
|
||||||
|
|
||||||
|
return audio_data
|
||||||
|
|
||||||
def generate_dummy_inputs(
|
def generate_dummy_inputs(
|
||||||
self,
|
self,
|
||||||
preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
|
preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
|
||||||
@@ -273,6 +287,9 @@ class OnnxConfig(ABC):
|
|||||||
num_channels: int = 3,
|
num_channels: int = 3,
|
||||||
image_width: int = 40,
|
image_width: int = 40,
|
||||||
image_height: int = 40,
|
image_height: int = 40,
|
||||||
|
sampling_rate: int = 22050,
|
||||||
|
time_duration: float = 5.0,
|
||||||
|
frequency: int = 220,
|
||||||
tokenizer: "PreTrainedTokenizerBase" = None,
|
tokenizer: "PreTrainedTokenizerBase" = None,
|
||||||
) -> Mapping[str, Any]:
|
) -> Mapping[str, Any]:
|
||||||
"""
|
"""
|
||||||
@@ -297,6 +314,12 @@ class OnnxConfig(ABC):
|
|||||||
The width of the generated images.
|
The width of the generated images.
|
||||||
image_height (`int`, *optional*, defaults to 40):
|
image_height (`int`, *optional*, defaults to 40):
|
||||||
The height of the generated images.
|
The height of the generated images.
|
||||||
|
sampling_rate (`int`, *optional* defaults to 22050)
|
||||||
|
The sampling rate for audio data generation.
|
||||||
|
time_duration (`float`, *optional* defaults to 5.0)
|
||||||
|
Total seconds of sampling for audio data generation.
|
||||||
|
frequency (`int`, *optional* defaults to 220)
|
||||||
|
The desired natural frequency of generated audio.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Mapping[str, Tensor] holding the kwargs to provide to the model's forward function
|
Mapping[str, Tensor] holding the kwargs to provide to the model's forward function
|
||||||
@@ -325,7 +348,12 @@ class OnnxConfig(ABC):
|
|||||||
seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
|
seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
|
||||||
)
|
)
|
||||||
# Generate dummy inputs according to compute batch and sequence
|
# Generate dummy inputs according to compute batch and sequence
|
||||||
dummy_input = [" ".join([preprocessor.unk_token]) * seq_length] * batch_size
|
input_token = (
|
||||||
|
preprocessor.unk_token
|
||||||
|
if (preprocessor.unk_token is not None and len(preprocessor.unk_token) > 0)
|
||||||
|
else "0"
|
||||||
|
)
|
||||||
|
dummy_input = [" ".join([input_token]) * seq_length] * batch_size
|
||||||
if self.task == "multiple-choice":
|
if self.task == "multiple-choice":
|
||||||
# If dynamic axis (-1) we forward with a fixed dimension of 4 candidate answers to avoid optimizations
|
# If dynamic axis (-1) we forward with a fixed dimension of 4 candidate answers to avoid optimizations
|
||||||
# made by ONNX
|
# made by ONNX
|
||||||
@@ -345,11 +373,32 @@ class OnnxConfig(ABC):
|
|||||||
batch_size = compute_effective_axis_dimension(batch_size, fixed_dimension=OnnxConfig.default_fixed_batch)
|
batch_size = compute_effective_axis_dimension(batch_size, fixed_dimension=OnnxConfig.default_fixed_batch)
|
||||||
dummy_input = self._generate_dummy_images(batch_size, num_channels, image_height, image_width)
|
dummy_input = self._generate_dummy_images(batch_size, num_channels, image_height, image_width)
|
||||||
return dict(preprocessor(images=dummy_input, return_tensors=framework))
|
return dict(preprocessor(images=dummy_input, return_tensors=framework))
|
||||||
|
elif (
|
||||||
|
isinstance(preprocessor, FeatureExtractionMixin) and preprocessor.model_input_names[0] == "input_features"
|
||||||
|
):
|
||||||
|
# If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
|
||||||
|
batch_size = compute_effective_axis_dimension(batch_size, fixed_dimension=OnnxConfig.default_fixed_batch)
|
||||||
|
dummy_input = self._generate_dummy_audio(batch_size, sampling_rate, time_duration, frequency)
|
||||||
|
return dict(preprocessor(dummy_input, return_tensors=framework))
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Unable to generate dummy inputs for the model. Please provide a tokenizer or a preprocessor."
|
"Unable to generate dummy inputs for the model. Please provide a tokenizer or a preprocessor."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def generate_dummy_inputs_onnxruntime(self, reference_model_inputs: Mapping[str, Any]) -> Mapping[str, Any]:
|
||||||
|
"""
|
||||||
|
Generate inputs for ONNX Runtime using the reference model inputs. Override this to run inference with seq2seq
|
||||||
|
models which have the encoder and decoder exported as separate ONNX files.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
reference_model_inputs ([`Mapping[str, Tensor]`):
|
||||||
|
Reference inputs for the model.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`Mapping[str, Tensor]`: The mapping holding the kwargs to provide to the model's forward function
|
||||||
|
"""
|
||||||
|
return reference_model_inputs
|
||||||
|
|
||||||
def patch_ops(self):
|
def patch_ops(self):
|
||||||
for spec in self._patching_specs:
|
for spec in self._patching_specs:
|
||||||
custom_op = spec.custom_op if spec.op_wrapper is None else spec.op_wrapper(spec.custom_op)
|
custom_op = spec.custom_op if spec.op_wrapper is None else spec.op_wrapper(spec.custom_op)
|
||||||
|
|||||||
@@ -145,7 +145,21 @@ def export_pytorch(
|
|||||||
device = torch.device(device)
|
device = torch.device(device)
|
||||||
if device.type == "cuda" and torch.cuda.is_available():
|
if device.type == "cuda" and torch.cuda.is_available():
|
||||||
model.to(device)
|
model.to(device)
|
||||||
model_inputs = dict((k, v.to(device)) for k, v in model_inputs.items())
|
model_inputs_device = dict()
|
||||||
|
for k, v in model_inputs.items():
|
||||||
|
if isinstance(v, Tuple):
|
||||||
|
model_inputs_device[k] = tuple(
|
||||||
|
x.to(device) if isinstance(x, torch.Tensor) else None for x in v
|
||||||
|
)
|
||||||
|
elif isinstance(v, List):
|
||||||
|
model_inputs_device[k] = [
|
||||||
|
tuple(x.to(device) if isinstance(x, torch.Tensor) else None for x in t) for t in v
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
model_inputs_device[k] = v.to(device)
|
||||||
|
|
||||||
|
model_inputs = model_inputs_device
|
||||||
|
|
||||||
inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys())
|
inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys())
|
||||||
onnx_outputs = list(config.outputs.keys())
|
onnx_outputs = list(config.outputs.keys())
|
||||||
|
|
||||||
@@ -404,9 +418,12 @@ def validate_model_outputs(
|
|||||||
else:
|
else:
|
||||||
ref_outputs_dict[name] = value
|
ref_outputs_dict[name] = value
|
||||||
|
|
||||||
|
# Create onnxruntime inputs from the reference model inputs
|
||||||
|
reference_model_inputs_onnxruntime = config.generate_dummy_inputs_onnxruntime(reference_model_inputs)
|
||||||
|
|
||||||
# We flatten potential collection of inputs (i.e. past_keys)
|
# We flatten potential collection of inputs (i.e. past_keys)
|
||||||
onnx_inputs = {}
|
onnx_inputs = {}
|
||||||
for name, value in reference_model_inputs.items():
|
for name, value in reference_model_inputs_onnxruntime.items():
|
||||||
if isinstance(value, (list, tuple)):
|
if isinstance(value, (list, tuple)):
|
||||||
value = config.flatten_output_collection_property(name, value)
|
value = config.flatten_output_collection_property(name, value)
|
||||||
onnx_inputs.update({tensor_name: pt_tensor.numpy() for tensor_name, pt_tensor in value.items()})
|
onnx_inputs.update({tensor_name: pt_tensor.numpy() for tensor_name, pt_tensor in value.items()})
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ if is_torch_available():
|
|||||||
AutoModelForSemanticSegmentation,
|
AutoModelForSemanticSegmentation,
|
||||||
AutoModelForSeq2SeqLM,
|
AutoModelForSeq2SeqLM,
|
||||||
AutoModelForSequenceClassification,
|
AutoModelForSequenceClassification,
|
||||||
|
AutoModelForSpeechSeq2Seq,
|
||||||
AutoModelForTokenClassification,
|
AutoModelForTokenClassification,
|
||||||
AutoModelForVision2Seq,
|
AutoModelForVision2Seq,
|
||||||
)
|
)
|
||||||
@@ -100,6 +101,7 @@ class FeaturesManager:
|
|||||||
"masked-im": AutoModelForMaskedImageModeling,
|
"masked-im": AutoModelForMaskedImageModeling,
|
||||||
"semantic-segmentation": AutoModelForSemanticSegmentation,
|
"semantic-segmentation": AutoModelForSemanticSegmentation,
|
||||||
"vision2seq-lm": AutoModelForVision2Seq,
|
"vision2seq-lm": AutoModelForVision2Seq,
|
||||||
|
"speech2seq-lm": AutoModelForSpeechSeq2Seq,
|
||||||
}
|
}
|
||||||
if is_tf_available():
|
if is_tf_available():
|
||||||
_TASKS_TO_TF_AUTOMODELS = {
|
_TASKS_TO_TF_AUTOMODELS = {
|
||||||
@@ -492,6 +494,13 @@ class FeaturesManager:
|
|||||||
"vit": supported_features_mapping(
|
"vit": supported_features_mapping(
|
||||||
"default", "image-classification", "masked-im", onnx_config_cls="models.vit.ViTOnnxConfig"
|
"default", "image-classification", "masked-im", onnx_config_cls="models.vit.ViTOnnxConfig"
|
||||||
),
|
),
|
||||||
|
"whisper": supported_features_mapping(
|
||||||
|
"default",
|
||||||
|
"default-with-past",
|
||||||
|
"speech2seq-lm",
|
||||||
|
"speech2seq-lm-with-past",
|
||||||
|
onnx_config_cls="models.whisper.WhisperOnnxConfig",
|
||||||
|
),
|
||||||
"xlm": supported_features_mapping(
|
"xlm": supported_features_mapping(
|
||||||
"default",
|
"default",
|
||||||
"masked-lm",
|
"masked-lm",
|
||||||
|
|||||||
@@ -218,6 +218,7 @@ PYTORCH_EXPORT_MODELS = {
|
|||||||
("yolos", "hustvl/yolos-tiny"),
|
("yolos", "hustvl/yolos-tiny"),
|
||||||
("segformer", "nvidia/segformer-b0-finetuned-ade-512-512"),
|
("segformer", "nvidia/segformer-b0-finetuned-ade-512-512"),
|
||||||
("swin", "microsoft/swin-tiny-patch4-window7-224"),
|
("swin", "microsoft/swin-tiny-patch4-window7-224"),
|
||||||
|
("whisper", "openai/whisper-tiny.en"),
|
||||||
}
|
}
|
||||||
|
|
||||||
PYTORCH_EXPORT_ENCODER_DECODER_MODELS = {
|
PYTORCH_EXPORT_ENCODER_DECODER_MODELS = {
|
||||||
@@ -398,7 +399,7 @@ class OnnxExportTestCaseV2(TestCase):
|
|||||||
preprocessor = AutoTokenizer.from_pretrained(model_name)
|
preprocessor = AutoTokenizer.from_pretrained(model_name)
|
||||||
|
|
||||||
with NamedTemporaryFile("w") as decoder_output:
|
with NamedTemporaryFile("w") as decoder_output:
|
||||||
onnx_inputs, onnx_outputs = export(
|
_, onnx_outputs = export(
|
||||||
preprocessor,
|
preprocessor,
|
||||||
decoder_model,
|
decoder_model,
|
||||||
decoder_onnx_config,
|
decoder_onnx_config,
|
||||||
|
|||||||
Reference in New Issue
Block a user