Speech2TextTransformer (#10175)
* s2t * fix config * conversion script * fix import * add tokenizer * fix tok init * fix tokenizer * first version working * fix embeds * fix lm head * remove extra heads * fix convert script * handle encoder attn mask * style * better enc attn mask * override _prepare_attention_mask_for_generation * handle attn_maks in encoder and decoder * input_ids => input_features * enable use_cache * remove old code * expand embeddings if needed * remove logits bias * masked_lm_loss => loss * hack tokenizer to support feature processing * fix model_input_names * style * fix error message * doc * remove inputs_embeds * remove input_embeds * remove unnecessary docstring * quality * SpeechToText => Speech2Text * style * remove shared_embeds * subsample => conv * remove Speech2TextTransformerDecoderWrapper * update output_lengths formula * fix table * remove max_position_embeddings * update conversion scripts * add possibility to do upper case for now * add FeatureExtractor and Processor * add tests for extractor * require_torch_audio => require_torchaudio * add processor test * update import * remove classification head * attention mask is now 1D * update docstrings * attention mask should be of type long * handle attention mask from generate * alwyas return attention_mask * fix test * style * doc * Speech2TextTransformer => Speech2Text * Speech2TextTransformerConfig => Speech2TextConfig * remove dummy_inputs * nit * style * multilinguial tok * fix tokenizer * add tgt_lang setter * save lang_codes * fix tokenizer * add forced_bos_token_id to tokenizer * apply review suggestions * add torchaudio to extra deps * add speech deps to CI * fix dep * add libsndfile to ci * libsndfile1 * add speech to extras all * libsndfile1 -> libsndfile1 * libsndfile * libsndfile1-dev * apt update * add sudo to install * update deps table * install libsndfile1-dev on CI * tuple to list * init conv layer * add model tests * quality * add integration tests * skip_special_tokens * add 
speech_to_text_transformer in toctree * fix tokenizer * fix fp16 tests * add tokenizer tests * fix copyright * input_values => input_features * doc * add model in readme * doc * change checkpoint names * fix copyright * fix code example * add max_model_input_sizes in tokenizer * fix integration tests * add do_lower_case to tokenizer * remove clamp trick * fix "Add modeling imports here" * fix copyrights * fix tests * SpeechToTextTransformer => SpeechToText * fix naming * fix table formatting * fix typo * style * fix typos * remove speech dep from extras[testing] * fix copies * rename doc file, * put imports under is_torch_available * run feat extract tests when torch is available * dummy objects for processor and extractor * fix imports in tests * fix import in modeling test * fxi imports * fix torch import * fix imports again * fix positional embeddings * fix typo in import * adapt new extractor refactor * style * fix torchscript test * doc * doc * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> * fix docs, copied from, style * fix docstring * handle imports * remove speech from all extra deps * remove s2t from seq2seq lm mapping * better names * skip training tests * add install instructions * List => Tuple * doc * fix conversion script * fix urls * add instruction for libsndfile * fix fp16 test Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -135,6 +135,11 @@ _import_structure = {
|
||||
"Wav2Vec2Processor",
|
||||
],
|
||||
"models.m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config", "M2M100Tokenizer"],
|
||||
"models.speech_to_text": [
|
||||
"SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
|
||||
"Speech2TextConfig",
|
||||
"Speech2TextFeatureExtractor",
|
||||
],
|
||||
"models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"],
|
||||
"models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"],
|
||||
"models.auto": [
|
||||
@@ -275,6 +280,8 @@ if is_sentencepiece_available():
|
||||
_import_structure["models.mt5"].append("MT5Tokenizer")
|
||||
_import_structure["models.pegasus"].append("PegasusTokenizer")
|
||||
_import_structure["models.reformer"].append("ReformerTokenizer")
|
||||
_import_structure["models.speech_to_text"].append("Speech2TextTokenizer")
|
||||
_import_structure["models.speech_to_text"].append("Speech2TextProcessor")
|
||||
_import_structure["models.t5"].append("T5Tokenizer")
|
||||
_import_structure["models.xlm_prophetnet"].append("XLMProphetNetTokenizer")
|
||||
_import_structure["models.xlm_roberta"].append("XLMRobertaTokenizer")
|
||||
@@ -377,6 +384,14 @@ if is_torch_available():
|
||||
_import_structure["modeling_utils"] = ["Conv1D", "PreTrainedModel", "apply_chunking_to_forward", "prune_layer"]
|
||||
# PyTorch models structure
|
||||
|
||||
_import_structure["models.speech_to_text"].extend(
|
||||
[
|
||||
"SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"Speech2TextForConditionalGeneration",
|
||||
"Speech2TextModel",
|
||||
]
|
||||
)
|
||||
|
||||
_import_structure["models.wav2vec2"].extend(
|
||||
[
|
||||
"WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
@@ -1379,6 +1394,11 @@ if TYPE_CHECKING:
|
||||
from .models.reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig
|
||||
from .models.retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig, RetriBertTokenizer
|
||||
from .models.roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaTokenizer
|
||||
from .models.speech_to_text import (
|
||||
SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
Speech2TextConfig,
|
||||
Speech2TextFeatureExtractor,
|
||||
)
|
||||
from .models.squeezebert import SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, SqueezeBertConfig, SqueezeBertTokenizer
|
||||
from .models.t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
|
||||
from .models.tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig, TapasTokenizer
|
||||
@@ -1461,6 +1481,7 @@ if TYPE_CHECKING:
|
||||
from .models.mt5 import MT5Tokenizer
|
||||
from .models.pegasus import PegasusTokenizer
|
||||
from .models.reformer import ReformerTokenizer
|
||||
from .models.speech_to_text import Speech2TextProcessor, Speech2TextTokenizer
|
||||
from .models.t5 import T5Tokenizer
|
||||
from .models.xlm_prophetnet import XLMProphetNetTokenizer
|
||||
from .models.xlm_roberta import XLMRobertaTokenizer
|
||||
@@ -1862,6 +1883,11 @@ if TYPE_CHECKING:
|
||||
RobertaForTokenClassification,
|
||||
RobertaModel,
|
||||
)
|
||||
from .models.speech_to_text import (
|
||||
SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
Speech2TextForConditionalGeneration,
|
||||
Speech2TextModel,
|
||||
)
|
||||
from .models.squeezebert import (
|
||||
SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
SqueezeBertForMaskedLM,
|
||||
|
||||
@@ -47,6 +47,7 @@ deps = {
|
||||
"timeout-decorator": "timeout-decorator",
|
||||
"tokenizers": "tokenizers>=0.10.1,<0.11",
|
||||
"torch": "torch>=1.0",
|
||||
"torchaudio": "torchaudio",
|
||||
"tqdm": "tqdm>=4.27",
|
||||
"unidic": "unidic>=1.0.2",
|
||||
"unidic_lite": "unidic_lite>=1.0.7",
|
||||
|
||||
@@ -177,6 +177,13 @@ try:
|
||||
except importlib_metadata.PackageNotFoundError:
|
||||
_soundfile_available = False
|
||||
|
||||
# Detect torchaudio without importing it: look for an import spec first, then confirm that the installed
# distribution's metadata can be read. A missing distribution marks the package as unavailable.
_torchaudio_available = importlib.util.find_spec("torchaudio") is not None
try:
    _torchaudio_version = importlib_metadata.version("torchaudio")
    logger.debug(f"Successfully imported torchaudio version {_torchaudio_version}")
except importlib_metadata.PackageNotFoundError:
    _torchaudio_available = False
|
||||
|
||||
|
||||
torch_cache_home = os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch"))
|
||||
old_default_cache_path = os.path.join(torch_cache_home, "transformers")
|
||||
@@ -364,6 +371,10 @@ def is_soundfile_availble():
|
||||
return _soundfile_available
|
||||
|
||||
|
||||
def is_torchaudio_available():
    """Return the module-level ``_torchaudio_available`` flag (truthy when torchaudio was detected)."""
    return _torchaudio_available
|
||||
|
||||
|
||||
def torch_only_method(fn):
|
||||
def wrapper(*args, **kwargs):
|
||||
if not _torch_available:
|
||||
|
||||
@@ -384,7 +384,7 @@ class GenerationMixin:
|
||||
)
|
||||
if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id:
|
||||
return input_ids.ne(pad_token_id).long()
|
||||
return input_ids.new_ones(input_ids.shape)
|
||||
return input_ids.new_ones(input_ids.shape, dtype=torch.long)
|
||||
|
||||
def _prepare_encoder_decoder_kwargs_for_generation(
|
||||
self, input_ids: torch.LongTensor, model_kwargs
|
||||
@@ -402,8 +402,7 @@ class GenerationMixin:
|
||||
) -> torch.LongTensor:
|
||||
decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
|
||||
decoder_input_ids = (
|
||||
torch.ones((input_ids.shape[0], 1), dtype=input_ids.dtype, device=input_ids.device)
|
||||
* decoder_start_token_id
|
||||
torch.ones((input_ids.shape[0], 1), dtype=torch.long, device=input_ids.device) * decoder_start_token_id
|
||||
)
|
||||
return decoder_input_ids
|
||||
|
||||
|
||||
@@ -60,6 +60,7 @@ from . import (
|
||||
reformer,
|
||||
retribert,
|
||||
roberta,
|
||||
speech_to_text,
|
||||
squeezebert,
|
||||
t5,
|
||||
tapas,
|
||||
|
||||
@@ -58,6 +58,10 @@ from ..rag.configuration_rag import RagConfig
|
||||
from ..reformer.configuration_reformer import ReformerConfig
|
||||
from ..retribert.configuration_retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig
|
||||
from ..roberta.configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
|
||||
from ..speech_to_text.configuration_speech_to_text import (
|
||||
SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
Speech2TextConfig,
|
||||
)
|
||||
from ..squeezebert.configuration_squeezebert import SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, SqueezeBertConfig
|
||||
from ..t5.configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
|
||||
from ..tapas.configuration_tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig
|
||||
@@ -76,6 +80,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
|
||||
(key, value)
|
||||
for pretrained_map in [
|
||||
# Add archive maps here
|
||||
SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
@@ -122,6 +127,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
|
||||
CONFIG_MAPPING = OrderedDict(
|
||||
[
|
||||
# Add configs here
|
||||
("speech_to_text", Speech2TextConfig),
|
||||
("wav2vec2", Wav2Vec2Config),
|
||||
("m2m_100", M2M100Config),
|
||||
("convbert", ConvBertConfig),
|
||||
@@ -174,6 +180,7 @@ CONFIG_MAPPING = OrderedDict(
|
||||
MODEL_NAMES_MAPPING = OrderedDict(
|
||||
[
|
||||
# Add full (and cased) model names here
|
||||
("speech_to_text", "Speech2Text"),
|
||||
("wav2vec2", "Wav2Vec2"),
|
||||
("m2m_100", "M2M100"),
|
||||
("convbert", "ConvBERT"),
|
||||
|
||||
@@ -66,8 +66,6 @@ from ..camembert.modeling_camembert import (
|
||||
CamembertForTokenClassification,
|
||||
CamembertModel,
|
||||
)
|
||||
|
||||
# Add modeling imports here
|
||||
from ..convbert.modeling_convbert import (
|
||||
ConvBertForMaskedLM,
|
||||
ConvBertForMultipleChoice,
|
||||
@@ -211,6 +209,7 @@ from ..roberta.modeling_roberta import (
|
||||
RobertaForTokenClassification,
|
||||
RobertaModel,
|
||||
)
|
||||
from ..speech_to_text.modeling_speech_to_text import Speech2TextForConditionalGeneration, Speech2TextModel
|
||||
from ..squeezebert.modeling_squeezebert import (
|
||||
SqueezeBertForMaskedLM,
|
||||
SqueezeBertForMultipleChoice,
|
||||
@@ -296,6 +295,7 @@ from .configuration_auto import (
|
||||
ReformerConfig,
|
||||
RetriBertConfig,
|
||||
RobertaConfig,
|
||||
Speech2TextConfig,
|
||||
SqueezeBertConfig,
|
||||
T5Config,
|
||||
TapasConfig,
|
||||
@@ -315,6 +315,7 @@ logger = logging.get_logger(__name__)
|
||||
MODEL_MAPPING = OrderedDict(
|
||||
[
|
||||
# Base model mapping
|
||||
(Speech2TextConfig, Speech2TextModel),
|
||||
(Wav2Vec2Config, Wav2Vec2Model),
|
||||
(M2M100Config, M2M100Model),
|
||||
(ConvBertConfig, ConvBertModel),
|
||||
@@ -399,6 +400,7 @@ MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
|
||||
MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
|
||||
[
|
||||
# Model with LM heads mapping
|
||||
(Speech2TextConfig, Speech2TextForConditionalGeneration),
|
||||
(Wav2Vec2Config, Wav2Vec2ForMaskedLM),
|
||||
(M2M100Config, M2M100ForConditionalGeneration),
|
||||
(ConvBertConfig, ConvBertForMaskedLM),
|
||||
|
||||
77
src/transformers/models/speech_to_text/__init__.py
Normal file
77
src/transformers/models/speech_to_text/__init__.py
Normal file
@@ -0,0 +1,77 @@
|
||||
# flake8: noqa
|
||||
# There's no way to ignore "F401 '...' imported but unused" warnings in this
|
||||
# module, but to preserve other warnings. So, don't check this module at all.
|
||||
|
||||
# Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ...file_utils import _BaseLazyModule, is_sentencepiece_available, is_torch_available
|
||||
|
||||
|
||||
# Maps each submodule name to the public names it provides. The mapping is handed to `_LazyModule` at the
# bottom of this file.
_import_structure = {
    "configuration_speech_to_text": [
        "SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "Speech2TextConfig",
    ],
    "feature_extraction_speech_to_text": ["Speech2TextFeatureExtractor"],
}

# The tokenizer and the processor are only registered when sentencepiece is available.
if is_sentencepiece_available():
    _import_structure["tokenization_speech_to_text"] = ["Speech2TextTokenizer"]
    _import_structure["processing_speech_to_text"] = ["Speech2TextProcessor"]

# The modeling classes are only registered when PyTorch is available.
if is_torch_available():
    _import_structure["modeling_speech_to_text"] = [
        "SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "Speech2TextForConditionalGeneration",
        "Speech2TextModel",
        "Speech2TextPreTrainedModel",
    ]

# Under TYPE_CHECKING the names are imported directly; otherwise the module object registered in
# `sys.modules` is replaced by a `_LazyModule` built from `_import_structure`.
if TYPE_CHECKING:
    from .configuration_speech_to_text import SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2TextConfig
    from .feature_extraction_speech_to_text import Speech2TextFeatureExtractor

    if is_sentencepiece_available():
        from .processing_speech_to_text import Speech2TextProcessor
        from .tokenization_speech_to_text import Speech2TextTokenizer

    if is_torch_available():
        from .modeling_speech_to_text import (
            SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
            Speech2TextForConditionalGeneration,
            Speech2TextModel,
            Speech2TextPreTrainedModel,
        )

else:
    import importlib
    import os
    import sys

    class _LazyModule(_BaseLazyModule):
        """
        Module class that surfaces all objects but only performs associated imports when the objects are requested.
        """

        # Carry over this file's location so the replacement module keeps `__file__` and `__path__`.
        __file__ = globals()["__file__"]
        __path__ = [os.path.dirname(__file__)]

        def _get_module(self, module_name: str):
            # Relative import of the submodule, resolved against this package's name.
            return importlib.import_module("." + module_name, self.__name__)

    # Replace this module's entry in `sys.modules` with the lazy module.
    sys.modules[__name__] = _LazyModule(__name__, _import_structure)
|
||||
@@ -0,0 +1,200 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Speech2Text model configuration """
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
# Maps each pretrained checkpoint identifier to the URL of its hosted `config.json`.
SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/s2t-small-librispeech-asr": "https://huggingface.co/facebook/s2t-small-librispeech-asr/resolve/main/config.json",
    # See all Speech2Text models at https://huggingface.co/models?filter=speech_to_text
}
|
||||
|
||||
|
||||
class Speech2TextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.Speech2TextModel`. It is used
    to instantiate a Speech2Text model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the Speech2Text
    `facebook/s2t-small-librispeech-asr <https://huggingface.co/facebook/s2t-small-librispeech-asr>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.


    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 10000):
            Vocabulary size of the Speech2Text model. Defines the number of different tokens that can be represented by
            the :obj:`inputs_ids` passed when calling :class:`~transformers.Speech2TextModel`
        d_model (:obj:`int`, `optional`, defaults to 256):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (:obj:`int`, `optional`, defaults to 12):
            Number of encoder layers.
        decoder_layers (:obj:`int`, `optional`, defaults to 6):
            Number of decoder layers.
        encoder_attention_heads (:obj:`int`, `optional`, defaults to 4):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (:obj:`int`, `optional`, defaults to 4):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
        dropout (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for classifier.
        init_std (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        encoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the encoder. See the `LayerDrop paper
            <https://arxiv.org/abs/1909.11556>`__ for more details.
        decoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the decoder. See the `LayerDrop paper
            <https://arxiv.org/abs/1909.11556>`__ for more details.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to scale the embeddings; the scale factor will be sqrt(d_model) if :obj:`True`.
        max_source_positions (:obj:`int`, `optional`, defaults to 6000):
            The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
        max_target_positions (:obj:`int`, `optional`, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        num_conv_layers (:obj:`int`, `optional`, defaults to 2):
            Number of 1D convolutional layers in the conv module.
        conv_kernel_sizes (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 5)`):
            A tuple of integers defining the kernel size of each 1D convolutional layer in the conv module. The length
            of :obj:`conv_kernel_sizes` has to match :obj:`num_conv_layers`.
        conv_channels (:obj:`int`, `optional`, defaults to 1024):
            An integer defining the number of output channels of each convolution layers except the final one in the
            conv module.
        input_feat_per_channel (:obj:`int`, `optional`, defaults to 80):
            An integer specifying the size of feature vector. This is also the dimensions of log-mel filter-bank
            features.
        input_channels (:obj:`int`, `optional`, defaults to 1):
            An integer specifying number of input channels of the input feature vector.

    Raises:
        ValueError: If the length of :obj:`conv_kernel_sizes` differs from :obj:`num_conv_layers`.

    Example::

        >>> from transformers import Speech2TextModel, Speech2TextConfig

        >>> # Initializing a Speech2Text s2t_transformer_s style configuration
        >>> configuration = Speech2TextConfig()

        >>> # Initializing a model from the s2t_transformer_s style configuration
        >>> model = Speech2TextModel(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config
    """
    model_type = "speech_to_text"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=10000,
        encoder_layers=12,
        encoder_ffn_dim=2048,
        encoder_attention_heads=4,
        decoder_layers=6,
        decoder_ffn_dim=2048,
        decoder_attention_heads=4,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        use_cache=True,
        is_encoder_decoder=True,
        activation_function="relu",
        d_model=256,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        decoder_start_token_id=2,
        classifier_dropout=0.0,
        scale_embedding=True,
        gradient_checkpointing=False,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        max_source_positions=6000,
        max_target_positions=1024,
        num_conv_layers=2,
        conv_kernel_sizes=(5, 5),
        conv_channels=1024,
        input_feat_per_channel=80,
        input_channels=1,
        **kwargs
    ):
        # Token ids and the encoder-decoder flag are handled by the base configuration class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers
        self.gradient_checkpointing = gradient_checkpointing
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
        self.max_source_positions = max_source_positions
        self.max_target_positions = max_target_positions
        self.num_conv_layers = num_conv_layers
        # Stored as a list regardless of the sequence type that was passed in.
        self.conv_kernel_sizes = list(conv_kernel_sizes)
        self.conv_channels = conv_channels
        self.input_feat_per_channel = input_feat_per_channel
        self.input_channels = input_channels

        if len(self.conv_kernel_sizes) != self.num_conv_layers:
            # Adjacent literals are concatenated as-is, so each piece carries its own trailing space.
            raise ValueError(
                "Configuration for convolutional module is incorrect. "
                "It is required that `len(config.conv_kernel_sizes)` == `config.num_conv_layers` "
                f"but is `len(config.conv_kernel_sizes) = {len(self.conv_kernel_sizes)}`, "
                f"`config.num_conv_layers = {self.num_conv_layers}`."
            )

    @property
    def num_attention_heads(self) -> int:
        """Alias for :obj:`encoder_attention_heads`."""
        return self.encoder_attention_heads

    @property
    def hidden_size(self) -> int:
        """Alias for :obj:`d_model`."""
        return self.d_model
|
||||
@@ -0,0 +1,112 @@
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from transformers import Speech2TextConfig, Speech2TextForConditionalGeneration
|
||||
|
||||
|
||||
def remove_ignore_keys_(state_dict):
    """Delete, in place, the checkpoint entries that are not loaded into the converted model.

    Keys that are absent from ``state_dict`` are silently ignored.
    """
    for unwanted in (
        "encoder.version",
        "decoder.version",
        "model.encoder.version",
        "model.decoder.version",
        "decoder.output_projection.weight",
        "_float_tensor",
        "encoder.embed_positions._float_tensor",
        "decoder.embed_positions._float_tensor",
    ):
        state_dict.pop(unwanted, None)
|
||||
|
||||
|
||||
def rename_keys(s_dict):
    """Rename checkpoint keys in place.

    A key containing ``transformer_layers`` has that substring replaced by ``layers``; otherwise a key containing
    ``subsample`` has that substring replaced by ``conv``. Renamed entries are re-inserted under their new key.
    """
    # Iterate over a snapshot because the mapping is mutated inside the loop.
    for old_name in tuple(s_dict.keys()):
        if "transformer_layers" in old_name:
            new_name = old_name.replace("transformer_layers", "layers")
        elif "subsample" in old_name:
            new_name = old_name.replace("subsample", "conv")
        else:
            continue
        s_dict[new_name] = s_dict.pop(old_name)
|
||||
|
||||
|
||||
def make_linear_from_emb(emb):
    """
    Build a bias-free linear layer that reuses the weight of an embedding layer.

    Args:
        emb: Embedding module whose ``weight`` has shape ``(vocab_size, emb_size)``.

    Returns:
        :obj:`nn.Linear`: A layer with ``in_features=emb_size`` and ``out_features=vocab_size`` whose weight data is
        ``emb.weight.data``.
    """
    vocab_size, emb_size = emb.weight.shape
    # `nn.Linear(in_features, out_features)` stores its weight as (out_features, in_features), which is
    # (vocab_size, emb_size) here, so the layer's metadata agrees with the embedding weight assigned below.
    lin_layer = nn.Linear(emb_size, vocab_size, bias=False)
    lin_layer.weight.data = emb.weight.data
    return lin_layer
|
||||
|
||||
|
||||
def convert_fairseq_s2t_checkpoint_to_tfms(checkpoint_path, pytorch_dump_folder_path):
    """Load a fairseq checkpoint, rebuild it as a `Speech2TextForConditionalGeneration` and save it.

    Args:
        checkpoint_path: Path of the checkpoint file passed to ``torch.load``.
        pytorch_dump_folder_path: Destination passed to ``save_pretrained``.
    """
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    fairseq_args = checkpoint["args"]
    weights = checkpoint["model"]
    # Keep a handle on the output projection before `remove_ignore_keys_` drops it from the state dict.
    output_projection = weights["decoder.output_projection.weight"]

    remove_ignore_keys_(weights)
    rename_keys(weights)

    num_embeddings = weights["decoder.embed_tokens.weight"].shape[0]
    share_embeddings = fairseq_args.share_decoder_input_output_embed
    # The checkpoint stores the kernel sizes as a comma-separated string.
    kernel_sizes = [int(size) for size in fairseq_args.conv_kernel_sizes.split(",")]

    config = Speech2TextConfig(
        vocab_size=num_embeddings,
        max_source_positions=fairseq_args.max_source_positions,
        max_target_positions=fairseq_args.max_target_positions,
        encoder_layers=fairseq_args.encoder_layers,
        decoder_layers=fairseq_args.decoder_layers,
        encoder_attention_heads=fairseq_args.encoder_attention_heads,
        decoder_attention_heads=fairseq_args.decoder_attention_heads,
        encoder_ffn_dim=fairseq_args.encoder_ffn_embed_dim,
        decoder_ffn_dim=fairseq_args.decoder_ffn_embed_dim,
        d_model=fairseq_args.encoder_embed_dim,
        dropout=fairseq_args.dropout,
        attention_dropout=fairseq_args.attention_dropout,
        activation_dropout=fairseq_args.activation_dropout,
        activation_function="relu",
        num_conv_layers=len(kernel_sizes),
        conv_channels=fairseq_args.conv_channels,
        conv_kernel_sizes=kernel_sizes,
        input_feat_per_channel=fairseq_args.input_feat_per_channel,
        input_channels=fairseq_args.input_channels,
        tie_word_embeddings=share_embeddings,
        num_beams=5,
        max_length=200,
        use_cache=True,
        decoder_start_token_id=2,
        early_stopping=True,
    )

    hf_model = Speech2TextForConditionalGeneration(config)
    hf_model.model.load_state_dict(weights)
    if not share_embeddings:
        # Untied embeddings: restore the projection weights taken from the checkpoint.
        hf_model.lm_head.weight.data = output_projection
    else:
        # Tied embeddings: the LM head reuses the decoder token embedding weights.
        hf_model.lm_head = make_linear_from_emb(hf_model.model.decoder.embed_tokens)

    hf_model.save_pretrained(pytorch_dump_folder_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: both paths are positional arguments.
    cli = argparse.ArgumentParser()
    # Required parameters
    cli.add_argument("fairseq_path", type=str, help="Path to the fairseq model (.pt) file.")
    cli.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    cli_args = cli.parse_args()
    convert_fairseq_s2t_checkpoint_to_tfms(cli_args.fairseq_path, cli_args.pytorch_dump_folder_path)
|
||||
@@ -0,0 +1,225 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Feature extractor class for Speech2Text
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...file_utils import PaddingStrategy, TensorType, is_torch_available, is_torchaudio_available
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
if is_torchaudio_available():
|
||||
import torchaudio.compliance.kaldi as ta_kaldi
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a Speech2Text feature extractor.

    This feature extractor inherits from
    :class:`~transformers.feature_extraction_sequence_utils.SequenceFeatureExtractor` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts mel-filter bank features from raw speech using TorchAudio and applies utterance-level cepstral
    mean and variance normalization to the extracted features.

    Args:
        feature_size (:obj:`int`, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (:obj:`int`, defaults to 16000):
            The sampling rate at which the audio files should be digitalized, expressed in Hertz (Hz).
        num_mel_bins (:obj:`int`, defaults to 80):
            Number of Mel-frequency bins.
        padding_value (:obj:`float`, defaults to 0.0):
            The value that is used to fill the padding vectors.
        do_ceptral_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to apply utterance-level cepstral mean and variance normalization to extracted features.
        normalize_means (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to zero-mean normalize the extracted features.
        normalize_vars (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to unit-variance normalize the extracted features.

    Raises:
        :obj:`ImportError`: If torchaudio is not installed.
    """

    model_input_names = ["input_features", "attention_mask"]

    def __init__(
        self,
        feature_size=80,
        sampling_rate=16000,
        num_mel_bins=80,
        padding_value=0.0,
        do_ceptral_normalize=True,
        normalize_means=True,
        normalize_vars=True,
        **kwargs
    ):
        # Fail early: feature extraction below relies on `torchaudio.compliance.kaldi`.
        if not is_torchaudio_available():
            raise ImportError("`Speech2TextFeatureExtractor` requires torchaudio: `pip install torchaudio`.")
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
        self.num_mel_bins = num_mel_bins
        self.do_ceptral_normalize = do_ceptral_normalize
        self.normalize_means = normalize_means
        self.normalize_vars = normalize_vars
        # The attention mask is always requested by default for this extractor.
        self.return_attention_mask = True

    def _extract_fbank_features(
        self,
        waveform: np.ndarray,
    ) -> np.ndarray:
        """
        Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
        and hence the waveform should not be normalized before feature extraction.

        Args:
            waveform (:obj:`np.ndarray`):
                A single utterance; it is scaled by ``2 ** 15`` and given a leading dimension before being passed to
                ``ta_kaldi.fbank``.

        Returns:
            :obj:`np.ndarray`: The output of ``ta_kaldi.fbank`` converted to a numpy array.
        """
        waveform = waveform * (2 ** 15)  # Kaldi compliance: 16-bit signed integers
        waveform = torch.from_numpy(waveform).unsqueeze(0)
        features = ta_kaldi.fbank(waveform, num_mel_bins=self.num_mel_bins, sample_frequency=self.sampling_rate)
        return features.numpy()

    @staticmethod
    def utterance_cmvn(
        x: np.ndarray, normalize_means: Optional[bool] = True, normalize_vars: Optional[bool] = True
    ) -> np.ndarray:
        """
        Apply mean and/or variance normalization along axis 0 of ``x``.

        Args:
            x (:obj:`np.ndarray`):
                Features of one utterance. Statistics are computed over axis 0 (presumably the frame axis -- confirm
                against the caller).
            normalize_means (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether to subtract the per-column mean.
            normalize_vars (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether to divide by the per-column standard deviation.

        Returns:
            :obj:`np.ndarray`: The normalized features.
        """
        # Both statistics are taken from the un-normalized input.
        mean = x.mean(axis=0)
        square_sums = (x ** 2).sum(axis=0)

        if normalize_means:
            x = np.subtract(x, mean)
        if normalize_vars:
            # Var[x] = E[x^2] - E[x]^2, floored at 1e-10 before the square root.
            var = square_sums / x.shape[0] - mean ** 2
            std = np.sqrt(np.maximum(var, 1e-10))
            x = np.divide(x, std)

        return x

    def normalize(self, input_values: List[np.ndarray]) -> List[np.ndarray]:
        """Apply :meth:`utterance_cmvn` to every element of ``input_values`` using this extractor's settings."""
        return [self.utterance_cmvn(x, self.normalize_means, self.normalize_vars) for x in input_values]

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Union[bool, str, PaddingStrategy] = False,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_speech (:obj:`np.ndarray`, :obj:`List[float]`, :obj:`List[np.ndarray]`, :obj:`List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence if provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
            max_length (:obj:`int`, `optional`):
                Maximum length of the returned list and optionally padding length (see above).
            pad_to_multiple_of (:obj:`int`, `optional`):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (:obj:`bool`, `optional`):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                `What are attention masks? <../glossary.html#attention-mask>`__

                .. note::

                    For Speech2Text models, :obj:`attention_mask` should always be passed for batched inference, to
                    avoid subtle bugs.

            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
            sampling_rate (:obj:`int`, `optional`):
                The sampling rate at which the :obj:`raw_speech` input was sampled. It is strongly recommended to pass
                :obj:`sampling_rate` at the forward call to prevent silent errors.
            **kwargs
                Additional keyword arguments forwarded unchanged to :obj:`self.pad`.

        Returns:
            :class:`~transformers.feature_extraction_utils.BatchFeature`: The result of :obj:`self.pad` applied to the
            extracted (and optionally normalized) ``input_features``.

        Raises:
            :obj:`ValueError`: If :obj:`sampling_rate` is given and differs from the extractor's own sampling rate.
        """

        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of {self.sampling_rate}. "
                    f"Please make sure that the provided `raw_speech` input was sampled with {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        # A list/tuple whose first element is itself an array or a sequence is treated as a batch.
        is_batched = bool(
            isinstance(raw_speech, (list, tuple))
            and (isinstance(raw_speech[0], np.ndarray) or isinstance(raw_speech[0], (tuple, list)))
        )

        # make sure input is in list format
        if is_batched and not isinstance(raw_speech[0], np.ndarray):
            raw_speech = [np.asarray(speech) for speech in raw_speech]
        elif not is_batched and not isinstance(raw_speech, np.ndarray):
            raw_speech = np.asarray(raw_speech)

        # always return batch
        if not is_batched:
            raw_speech = [raw_speech]

        # extract fbank features
        features = [self._extract_fbank_features(waveform) for waveform in raw_speech]

        # Utterance-level cepstral mean and variance normalization
        if self.do_ceptral_normalize:
            features = self.normalize(features)

        # convert into correct format for padding
        encoded_inputs = BatchFeature({"input_features": features})

        padded_inputs = self.pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_tensors=return_tensors,
            **kwargs,
        )

        return padded_inputs
|
||||
1353
src/transformers/models/speech_to_text/modeling_speech_to_text.py
Executable file
1353
src/transformers/models/speech_to_text/modeling_speech_to_text.py
Executable file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,144 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Speech processor class for Speech2Text
|
||||
"""
|
||||
from contextlib import contextmanager
|
||||
|
||||
from .feature_extraction_speech_to_text import Speech2TextFeatureExtractor
|
||||
from .tokenization_speech_to_text import Speech2TextTokenizer
|
||||
|
||||
|
||||
class Speech2TextProcessor:
    r"""
    Constructs a Speech2Text processor which wraps a Speech2Text feature extractor and a Speech2Text tokenizer into a
    single processor.

    :class:`~transformers.Speech2TextProcessor` offers all the functionalities of
    :class:`~transformers.Speech2TextFeatureExtractor` and :class:`~transformers.Speech2TextTokenizer`. See the
    :meth:`~transformers.Speech2TextProcessor.__call__` and :meth:`~transformers.Speech2TextProcessor.decode` for more
    information.

    Args:
        feature_extractor (:obj:`Speech2TextFeatureExtractor`):
            An instance of :class:`~transformers.Speech2TextFeatureExtractor`. The feature extractor is a required
            input.
        tokenizer (:obj:`Speech2TextTokenizer`):
            An instance of :class:`~transformers.Speech2TextTokenizer`. The tokenizer is a required input.

    Raises:
        :obj:`ValueError`: If either argument is not an instance of the expected class.
    """

    def __init__(self, feature_extractor, tokenizer):
        if not isinstance(feature_extractor, Speech2TextFeatureExtractor):
            # Interpolate the class itself; `.__class__` of a class is its metaclass (`type`).
            raise ValueError(
                f"`feature_extractor` has to be of type {Speech2TextFeatureExtractor}, but is {type(feature_extractor)}"
            )
        if not isinstance(tokenizer, Speech2TextTokenizer):
            raise ValueError(f"`tokenizer` has to be of type {Speech2TextTokenizer}, but is {type(tokenizer)}")

        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        # `__call__` dispatches to this attribute; it is swapped inside `as_target_processor`.
        self.current_processor = self.feature_extractor

    def save_pretrained(self, save_directory):
        """
        Save a Speech2Text feature extractor object and Speech2Text tokenizer object to the directory
        ``save_directory``, so that it can be re-loaded using the
        :func:`~transformers.Speech2TextProcessor.from_pretrained` class method.

        .. note::

            This class method is simply calling :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` and
            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
            docstrings of the methods above for more information.

        Args:
            save_directory (:obj:`str` or :obj:`os.PathLike`):
                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                be created if it does not exist).
        """

        self.feature_extractor.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r"""
        Instantiate a :class:`~transformers.Speech2TextProcessor` from a pretrained Speech2Text processor.

        .. note::

            This class method is simply calling Speech2TextFeatureExtractor's
            :meth:`~transformers.PreTrainedFeatureExtractor.from_pretrained` and Speech2TextTokenizer's
            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
            docstrings of the methods above for more information.

        Args:
            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
                This can be either:

                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing a feature extractor file saved using the
                  :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g.,
                  ``./my_model_directory/``.
                - a path or url to a saved feature extractor JSON `file`, e.g.,
                  ``./my_model_directory/feature_extraction_config.json``.
            **kwargs
                Additional keyword arguments passed along to both :class:`~transformers.PreTrainedFeatureExtractor` and
                :class:`~transformers.PreTrainedTokenizer`

        Returns:
            :class:`~transformers.Speech2TextProcessor`: A processor wrapping the two loaded components.
        """
        feature_extractor = Speech2TextFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
        tokenizer = Speech2TextTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)

        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def __call__(self, *args, **kwargs):
        """
        When used in normal mode, this method forwards all its arguments to Speech2TextFeatureExtractor's
        :meth:`~transformers.Speech2TextFeatureExtractor.__call__` and returns its output. If used in the context
        :meth:`~transformers.Speech2TextProcessor.as_target_processor` this method forwards all its arguments to
        Speech2TextTokenizer's :meth:`~transformers.Speech2TextTokenizer.__call__`. Please refer to the docstring of
        the above two methods for more information.
        """
        return self.current_processor(*args, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Speech2TextTokenizer's
        :meth:`~transformers.PreTrainedTokenizer.batch_decode`. Please refer to the docstring of this method for more
        information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Speech2TextTokenizer's
        :meth:`~transformers.PreTrainedTokenizer.decode`. Please refer to the docstring of this method for more
        information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @contextmanager
    def as_target_processor(self):
        """
        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
        Speech2Text.

        The feature extractor is restored as the current processor when the ``with`` block exits, including when the
        block raises.
        """
        self.current_processor = self.tokenizer
        try:
            yield
        finally:
            # Without `finally`, an exception in the `with` body would leave the tokenizer selected.
            self.current_processor = self.feature_extractor
|
||||
@@ -0,0 +1,259 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tokenization classes for Speech2Text."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from shutil import copyfile
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
import sentencepiece
|
||||
|
||||
from ...tokenization_utils import PreTrainedTokenizer
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
# Character replaced by a space when tokens are joined back into text.
SPIECE_UNDERLINE = "▁"

# File names used for the two vocabulary assets of the tokenizer.
VOCAB_FILES_NAMES = dict(
    vocab_file="vocab.json",
    spm_file="sentencepiece.bpe.model",
)

# Per-asset download locations, keyed by pretrained model id.
PRETRAINED_VOCAB_FILES_MAP = dict(
    vocab_file={
        "facebook/s2t-small-librispeech-asr": "https://huggingface.co/facebook/s2t-small-librispeech-asr/resolve/main/vocab.json",
    },
    spm_file={
        "facebook/s2t-small-librispeech-asr": "https://huggingface.co/facebook/s2t-small-librispeech-asr/resolve/main/sentencepiece.bpe.model"
    },
)

# Maximum input size per pretrained model id.
MAX_MODEL_INPUT_SIZES = {"facebook/s2t-small-librispeech-asr": 1024}

# Language codes available under the "mustc" key of LANGUAGES.
MUSTC_LANGS = ["pt", "fr", "ru", "nl", "ro", "it", "es", "de"]

# Maps a `lang_codes` identifier to its list of language codes.
LANGUAGES = dict(mustc=MUSTC_LANGS)
|
||||
|
||||
|
||||
class Speech2TextTokenizer(PreTrainedTokenizer):
    """
    Construct a Speech2Text tokenizer.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
    Users should refer to the superclass for more information regarding such methods.

    Args:
        vocab_file (:obj:`str`):
            File containing the vocabulary.
        spm_file (:obj:`str`):
            Path to the `SentencePiece <https://github.com/google/sentencepiece>`__ model file
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The beginning of sentence token.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sentence token.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        do_upper_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to uppercase the output when decoding.
        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to lowercase the input when tokenizing.
        tgt_lang (:obj:`str`, `optional`):
            A string representing the target language.
        lang_codes (:obj:`str`, `optional`):
            Key into ``LANGUAGES`` selecting the set of supported language codes. When :obj:`None`, no language tokens
            are set up and ``lang_code_to_id`` is empty.
        **kwargs
            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = MAX_MODEL_INPUT_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    # Class-level default (no prefix); instances rebind it in `set_tgt_lang_special_tokens`.
    prefix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        spm_file,
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        do_upper_case=False,
        do_lower_case=False,
        tgt_lang=None,
        lang_codes=None,
        **kwargs,
    ):
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            do_upper_case=do_upper_case,
            do_lower_case=do_lower_case,
            tgt_lang=tgt_lang,
            lang_codes=lang_codes,
            **kwargs,
        )
        self.do_upper_case = do_upper_case
        self.do_lower_case = do_lower_case

        # token -> id mapping read from the JSON vocab, and its inverse.
        self.encoder = load_json(vocab_file)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.spm_file = spm_file
        self.sp_model = load_spm(spm_file)

        if lang_codes is not None:
            self.lang_codes = lang_codes
            self.langs = LANGUAGES[lang_codes]
            self.lang_tokens = [f"<lang:{lang}>" for lang in self.langs]
            self.lang_code_to_id = {lang: self.sp_model.PieceToId(f"<lang:{lang}>") for lang in self.langs}

            self._additional_special_tokens = self.lang_tokens
            # Fall back to the first configured language when none is requested.
            self._tgt_lang = tgt_lang if tgt_lang is not None else self.langs[0]

            self.set_tgt_lang_special_tokens(self._tgt_lang)
        else:
            self.lang_code_to_id = {}

    @property
    def vocab_size(self) -> int:
        """Number of entries in the JSON vocabulary."""
        return len(self.encoder)

    @property
    def tgt_lang(self) -> str:
        """The currently selected target language code."""
        return self._tgt_lang

    @tgt_lang.setter
    def tgt_lang(self, new_tgt_lang) -> None:
        # Update the prefix first: an unknown language raises KeyError there, and in that case
        # `_tgt_lang` must keep its previous value so it stays in sync with `prefix_tokens`.
        self.set_tgt_lang_special_tokens(new_tgt_lang)
        self._tgt_lang = new_tgt_lang

    def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
        """
        Reset the special tokens to the target language setting: prefix=[tgt_lang_code]. The eos suffix is added in
        :meth:`build_inputs_with_special_tokens`.

        Raises:
            :obj:`KeyError`: If ``tgt_lang`` is not a key of ``lang_code_to_id``.
        """
        lang_code_id = self.lang_code_to_id[tgt_lang]
        self.prefix_tokens = [lang_code_id]

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into SentencePiece pieces."""
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab; unknown tokens map to the id of ``unk_token``."""
        return self.encoder.get(token, self.encoder[self.unk_token])

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the decoder."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()

        if self.do_upper_case:
            out_string = out_string.upper()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by prepending ``prefix_tokens`` and appending eos_token_id."""
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            :obj:`ValueError`: If ``already_has_special_tokens`` is set and a second sequence is supplied.
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            # Only bos/eos are flagged on this path.
            return [1 if x in [self.bos_token_id, self.eos_token_id] else 0 for x in token_ids_0]
        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1]
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def get_vocab(self) -> Dict:
        """Return a copy of the base vocabulary extended with ``added_tokens_encoder``."""
        vocab = self.encoder.copy()
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self) -> Dict:
        # The SentencePiece processor is dropped from the pickled state and reloaded in `__setstate__`.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d: Dict) -> None:
        self.__dict__ = d
        self.sp_model = load_spm(self.spm_file)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the JSON vocabulary and (if not already present) a copy of the SentencePiece model to
        ``save_directory``.

        Args:
            save_directory (:obj:`str`):
                An existing directory to write into.
            filename_prefix (:obj:`str`, `optional`):
                If given, ``"<prefix>-"`` is prepended to both file names.

        Returns:
            :obj:`Tuple[str]`: Paths of the vocabulary file and the SentencePiece model file.
        """
        save_dir = Path(save_directory)
        assert save_dir.is_dir(), f"{save_directory} should be a directory"
        vocab_save_path = save_dir / (
            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
        )
        spm_save_path = save_dir / (
            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["spm_file"]
        )

        save_json(self.encoder, vocab_save_path)

        # An existing model file at the destination is left untouched.
        if not spm_save_path.exists():
            copyfile(self.spm_file, spm_save_path)

        return (str(vocab_save_path), str(spm_save_path))
|
||||
|
||||
|
||||
def load_spm(path: str) -> sentencepiece.SentencePieceProcessor:
    """Create a SentencePiece processor and load the model file at ``path`` into it."""
    processor = sentencepiece.SentencePieceProcessor()
    # `str()` lets non-string path objects be passed as well.
    model_path = str(path)
    processor.Load(model_path)
    return processor
|
||||
|
||||
|
||||
def load_json(path: str) -> Union[Dict, List]:
    """
    Read and parse the JSON file at ``path``.

    The file is decoded as UTF-8 explicitly so the result does not depend on the platform's default encoding
    (vocabulary entries may contain non-ASCII characters).
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
|
||||
def save_json(data, path: str) -> None:
    """
    Serialize ``data`` as JSON (2-space indent) to the file at ``path``, overwriting it.

    The file is opened as UTF-8 explicitly, matching how ``load_json`` reads it back.
    """
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
|
||||
@@ -38,6 +38,7 @@ from .file_utils import (
|
||||
is_tokenizers_available,
|
||||
is_torch_available,
|
||||
is_torch_tpu_available,
|
||||
is_torchaudio_available,
|
||||
)
|
||||
from .integrations import is_optuna_available, is_ray_available
|
||||
|
||||
@@ -195,6 +196,19 @@ def require_torch_scatter(test_case):
|
||||
return test_case
|
||||
|
||||
|
||||
def require_torchaudio(test_case):
    """
    Decorator marking a test that requires torchaudio.

    These tests are skipped when torchaudio isn't installed.

    """
    # `is_torchaudio_available` must be called: the bare function object is always truthy,
    # which would make this decorator never skip anything.
    if not is_torchaudio_available():
        return unittest.skip("test requires torchaudio")(test_case)
    else:
        return test_case
|
||||
|
||||
|
||||
def require_tf(test_case):
|
||||
"""
|
||||
Decorator marking a test that requires TensorFlow.
|
||||
|
||||
@@ -2160,6 +2160,27 @@ class RobertaModel:
|
||||
requires_pytorch(self)
|
||||
|
||||
|
||||
SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
class Speech2TextForConditionalGeneration:
    # NOTE(review): looks like a stand-in for the real class when PyTorch is missing;
    # both entry points only call `requires_pytorch` (defined elsewhere) -- confirm its behavior there.
    def __init__(self, *args, **kwargs):
        requires_pytorch(self)

    @classmethod
    def from_pretrained(self, *args, **kwargs):
        # Under `@classmethod` the first argument is the class, despite being named `self`.
        requires_pytorch(self)
|
||||
|
||||
|
||||
class Speech2TextModel:
    # NOTE(review): looks like a stand-in for the real class when PyTorch is missing;
    # both entry points only call `requires_pytorch` (defined elsewhere) -- confirm its behavior there.
    def __init__(self, *args, **kwargs):
        requires_pytorch(self)

    @classmethod
    def from_pretrained(self, *args, **kwargs):
        # Under `@classmethod` the first argument is the class, despite being named `self`.
        requires_pytorch(self)
|
||||
|
||||
|
||||
SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
|
||||
@@ -92,6 +92,20 @@ class ReformerTokenizer:
|
||||
requires_sentencepiece(self)
|
||||
|
||||
|
||||
class Speech2TextProcessor:
    # NOTE(review): looks like a stand-in for the real class when sentencepiece is missing;
    # construction only calls `requires_sentencepiece` (defined elsewhere) -- confirm its behavior there.
    def __init__(self, *args, **kwargs):
        requires_sentencepiece(self)
|
||||
|
||||
|
||||
class Speech2TextTokenizer:
    # NOTE(review): looks like a stand-in for the real class when sentencepiece is missing;
    # both entry points only call `requires_sentencepiece` (defined elsewhere) -- confirm its behavior there.
    def __init__(self, *args, **kwargs):
        requires_sentencepiece(self)

    @classmethod
    def from_pretrained(self, *args, **kwargs):
        # Under `@classmethod` the first argument is the class, despite being named `self`.
        requires_sentencepiece(self)
|
||||
|
||||
|
||||
class T5Tokenizer:
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_sentencepiece(self)
|
||||
|
||||
Reference in New Issue
Block a user