diff --git a/docs/source/en/index.md b/docs/source/en/index.md index f45806e90b..29eb014148 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -439,7 +439,7 @@ Flax), PyTorch, and/or TensorFlow. | Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | | TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ | | TVLT | ❌ | ❌ | ✅ | ❌ | ❌ | -| UMT5 | ✅ | ✅ | ✅ | ✅ | ✅ | +| UMT5 | ❌ | ❌ | ✅ | ❌ | ❌ | | UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | | UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | | UPerNet | ❌ | ❌ | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/umt5.md b/docs/source/en/model_doc/umt5.md index 05a6b14319..d7dbc69e8e 100644 --- a/docs/source/en/model_doc/umt5.md +++ b/docs/source/en/model_doc/umt5.md @@ -73,6 +73,9 @@ The conversion script is also different because the model was saved in t5x's lat ['nyone who drink a alcohol A A. This I'] ``` +## UMT5Config + +[[autodoc]] UMT5Config ## UMT5Model diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 9e6fa1bf38..8245f49e0e 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -524,7 +524,7 @@ _import_structure = { "TvltFeatureExtractor", "TvltProcessor", ], - "models.umt5": [], + "models.umt5": ["UMT5Config"], "models.unispeech": [ "UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP", "UniSpeechConfig", @@ -4388,6 +4388,7 @@ if TYPE_CHECKING: ) from .models.trocr import TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP, TrOCRConfig, TrOCRProcessor from .models.tvlt import TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP, TvltConfig, TvltFeatureExtractor, TvltProcessor + from .models.umt5 import UMT5Config from .models.unispeech import UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechConfig from .models.unispeech_sat import UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechSatConfig from .models.upernet import UperNetConfig diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index bc5b2a420f..c0fb12586d 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ 
b/src/transformers/models/auto/configuration_auto.py @@ -194,7 +194,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("transfo-xl", "TransfoXLConfig"), ("trocr", "TrOCRConfig"), ("tvlt", "TvltConfig"), - ("umt5", "MT5Config"), + ("umt5", "UMT5Config"), ("unispeech", "UniSpeechConfig"), ("unispeech-sat", "UniSpeechSatConfig"), ("upernet", "UperNetConfig"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index ad2d310663..b5880fbe9f 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -324,6 +324,13 @@ else: ("tapas", ("TapasTokenizer", None)), ("tapex", ("TapexTokenizer", None)), ("transfo-xl", ("TransfoXLTokenizer", None)), + ( + "umt5", + ( + "T5Tokenizer" if is_sentencepiece_available() else None, + "T5TokenizerFast" if is_tokenizers_available() else None, + ), + ), ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)), diff --git a/src/transformers/models/umt5/__init__.py b/src/transformers/models/umt5/__init__.py index ef96931393..2ad1c56d23 100644 --- a/src/transformers/models/umt5/__init__.py +++ b/src/transformers/models/umt5/__init__.py @@ -17,7 +17,8 @@ from typing import TYPE_CHECKING from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available -_import_structure = {} +_import_structure = {"configuration_umt5": ["UMT5Config", "UMT5OnnxConfig"]} + try: if not is_torch_available(): @@ -34,6 +35,8 @@ else: ] if TYPE_CHECKING: + from .configuration_umt5 import UMT5Config, UMT5OnnxConfig + try: if not is_torch_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/umt5/configuration_umt5.py b/src/transformers/models/umt5/configuration_umt5.py new file mode 100644 index 0000000000..462b57f7f1 --- /dev/null 
+++ b/src/transformers/models/umt5/configuration_umt5.py @@ -0,0 +1,182 @@ +# coding=utf-8 +# Copyright 2023, The T5 Authors and HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" UMT5 model configuration""" +from typing import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxSeq2SeqConfigWithPast +from ...utils import logging + + +logger = logging.get_logger(__name__) + +UMT5_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "google/umt5-small": "https://huggingface.co/google/umt5-small/resolve/main/config.json", + # See all umt5 models at https://huggingface.co/models?filter=umt5 +} + + +class UMT5Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`UMT5Model`]. It is used to instantiate a UMT5 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the UMT5 + [google/umt5-small](https://huggingface.co/google/umt5-small) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Arguments: + vocab_size (`int`, *optional*, defaults to 250112): + Vocabulary size of the UMT5 model. Defines the number of different tokens that can be represented by the + `input_ids` passed when calling [`UMT5Model`]. 
+ d_model (`int`, *optional*, defaults to 512): + Size of the encoder layers and the pooler layer. + d_kv (`int`, *optional*, defaults to 64): + Size of the key, query, value projections per attention head. It is configured independently of `d_model` + and `num_heads` (the defaults 512, 6 and 64 do not satisfy `d_kv == d_model // num_heads`). + d_ff (`int`, *optional*, defaults to 1024): + Size of the intermediate feed forward layer in each `UMT5Block`. + num_layers (`int`, *optional*, defaults to 8): + Number of hidden layers in the Transformer encoder. + num_decoder_layers (`int`, *optional*): + Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set. + num_heads (`int`, *optional*, defaults to 6): + Number of attention heads for each attention layer in the Transformer encoder. + relative_attention_num_buckets (`int`, *optional*, defaults to 32): + The number of buckets to use for each attention layer. + relative_attention_max_distance (`int`, *optional*, defaults to 128): + The maximum distance of the longer sequences for the bucket separation. + dropout_rate (`float`, *optional*, defaults to 0.1): + The ratio for all dropout layers. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-6): + The epsilon used by the layer normalization layers. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`): + Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). 
+ """ + model_type = "umt5" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=250112, + d_model=512, + d_kv=64, + d_ff=1024, + num_layers=8, + num_decoder_layers=None, + num_heads=6, + relative_attention_num_buckets=32, + relative_attention_max_distance=128, + dropout_rate=0.1, + layer_norm_epsilon=1e-6, + initializer_factor=1.0, + feed_forward_proj="gated-gelu", + is_encoder_decoder=True, + use_cache=True, + tokenizer_class="T5Tokenizer", + tie_word_embeddings=True, + pad_token_id=0, + eos_token_id=1, + decoder_start_token_id=0, + **kwargs, + ): + super().__init__( + is_encoder_decoder=is_encoder_decoder, + tokenizer_class=tokenizer_class, + tie_word_embeddings=tie_word_embeddings, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) + self.vocab_size = vocab_size + self.d_model = d_model + self.d_kv = d_kv + self.d_ff = d_ff + self.num_layers = num_layers + self.num_decoder_layers = ( + num_decoder_layers if num_decoder_layers is not None else self.num_layers + ) # default = symmetry + self.num_heads = num_heads + self.relative_attention_num_buckets = relative_attention_num_buckets + self.relative_attention_max_distance = relative_attention_max_distance + self.dropout_rate = dropout_rate + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_factor = initializer_factor + self.feed_forward_proj = feed_forward_proj + self.use_cache = use_cache + + act_info = self.feed_forward_proj.split("-") + self.dense_act_fn = act_info[-1] + self.is_gated_act = act_info[0] == "gated" + + if (len(act_info) > 1 and act_info[0] != "gated") or len(act_info) > 2: + raise ValueError( + f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. " + "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 
" + "'gated-gelu' or 'relu'" + ) + + if feed_forward_proj == "gated-gelu": + self.dense_act_fn = "gelu_new" + + @property + def hidden_size(self): + return self.d_model + + @property + def num_attention_heads(self): + return self.num_heads + + @property + def num_hidden_layers(self): + return self.num_layers + + +class UMT5OnnxConfig(OnnxSeq2SeqConfigWithPast): + @property + # Copied from transformers.models.t5.configuration_t5.T5OnnxConfig.inputs + def inputs(self) -> Mapping[str, Mapping[int, str]]: + common_inputs = { + "input_ids": {0: "batch", 1: "encoder_sequence"}, + "attention_mask": {0: "batch", 1: "encoder_sequence"}, + } + if self.use_past: + common_inputs["attention_mask"][1] = "past_encoder_sequence + sequence" + common_inputs["decoder_input_ids"] = {0: "batch"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"} + else: + common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"} + + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction="inputs") + + return common_inputs + + @property + # Copied from transformers.models.t5.configuration_t5.T5OnnxConfig.default_onnx_opset + def default_onnx_opset(self) -> int: + return 13 + + @property + def atol_for_validation(self) -> float: + return 5e-4 diff --git a/src/transformers/models/umt5/modeling_umt5.py b/src/transformers/models/umt5/modeling_umt5.py index e3dc40d592..ba3bca7875 100644 --- a/src/transformers/models/umt5/modeling_umt5.py +++ b/src/transformers/models/umt5/modeling_umt5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch mT5 model.""" +""" PyTorch UMT5 model.""" import copy import math @@ -24,7 +24,6 @@ from torch.nn import CrossEntropyLoss from torch.utils.checkpoint import checkpoint from ...activations import ACT2FN -from ...configuration_utils import PretrainedConfig from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, @@ -42,6 +41,7 @@ from ...utils import ( logging, replace_return_docstrings, ) +from .configuration_umt5 import UMT5Config logger = logging.get_logger(__name__) @@ -76,9 +76,9 @@ class UMT5LayerNorm(nn.Module): return self.weight * hidden_states -# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->UMT5,UMT5Config->PretrainedConfig +# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->UMT5 class UMT5DenseActDense(nn.Module): - def __init__(self, config: PretrainedConfig): + def __init__(self, config: UMT5Config): super().__init__() self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) @@ -99,9 +99,9 @@ class UMT5DenseActDense(nn.Module): return hidden_states -# Copied from transformers.models.t5.modeling_t5.T5DenseGatedActDense with T5->UMT5,UMT5Config->PretrainedConfig +# Copied from transformers.models.t5.modeling_t5.T5DenseGatedActDense with T5->UMT5 class UMT5DenseGatedActDense(nn.Module): - def __init__(self, config: PretrainedConfig): + def __init__(self, config: UMT5Config): super().__init__() self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) @@ -129,9 +129,9 @@ class UMT5DenseGatedActDense(nn.Module): return hidden_states -# Copied from transformers.models.t5.modeling_t5.T5LayerFF with T5->UMT5,UMT5Config->PretrainedConfig +# Copied from transformers.models.t5.modeling_t5.T5LayerFF with T5->UMT5 class UMT5LayerFF(nn.Module): - def __init__(self, config: PretrainedConfig): + def __init__(self, config: UMT5Config): 
super().__init__() if config.is_gated_act: self.DenseReluDense = UMT5DenseGatedActDense(config) @@ -457,7 +457,7 @@ class UMT5PreTrainedModel(PreTrainedModel): models. """ - config_class = PretrainedConfig + config_class = UMT5Config base_model_prefix = "transformer" supports_gradient_checkpointing = True _no_split_modules = ["UMT5Block"] @@ -916,7 +916,7 @@ class UMT5Model(UMT5PreTrainedModel): >>> hidden_states = outputs.last_hidden_state ```""" model_type = "uumt5" - config_class = PretrainedConfig + config_class = UMT5Config _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config): diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py index 697e619f13..8bdab8ca73 100644 --- a/tests/models/umt5/test_modeling_umt5.py +++ b/tests/models/umt5/test_modeling_umt5.py @@ -35,7 +35,7 @@ if is_torch_available(): from transformers import AutoTokenizer, UMT5ForConditionalGeneration, UMT5ForQuestionAnswering, UMT5Model -# Copied from test.models.t5.test_modeling_t5.T5ModelTester with T5->UMT5,UMT5Config->T5Config +# Copied from test.models.t5.test_modeling_t5.T5ModelTester with T5->UMT5 class UMT5ModelTester: def __init__( self, diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index dad6888c80..1175f07740 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -54,6 +54,7 @@ SPECIAL_CASES_TO_ALLOW = { # used internally in the configuration class file # `tokenizer_class` get default value `T5Tokenizer` intentionally "MT5Config": ["feed_forward_proj", "tokenizer_class"], + "UMT5Config": ["feed_forward_proj", "tokenizer_class"], # used internally in the configuration class file "LongT5Config": ["feed_forward_proj"], # used internally in the configuration class file