From 879e1d32346cc4d24fb54bb336dcd5db4cd7005d Mon Sep 17 00:00:00 2001
From: Julien Plu
Date: Mon, 16 Mar 2020 14:29:21 +0100
Subject: [PATCH] Add TF2 version of FlauBERT (#2700)

* Add TF2 version of FlauBERT

* Add TF2 version of FlauBERT

* Add documentation

* Apply style and quality

* Apply style once again

Co-authored-by: Lysandre Debut
---
 src/transformers/__init__.py                  |  15 +-
 .../convert_pytorch_checkpoint_to_tf2.py      |  16 +
 src/transformers/modeling_tf_camembert.py     |   2 +-
 src/transformers/modeling_tf_flaubert.py      | 345 ++++++++++++++++++
 4 files changed, 369 insertions(+), 9 deletions(-)
 mode change 100644 => 100755 src/transformers/convert_pytorch_checkpoint_to_tf2.py
 create mode 100644 src/transformers/modeling_tf_flaubert.py

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 16b8315425..6d3699fc05 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -220,14 +220,6 @@ if is_torch_available():
         RobertaForQuestionAnswering,
         ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
     )
-    from .modeling_camembert import (
-        CamembertForMaskedLM,
-        CamembertModel,
-        CamembertForSequenceClassification,
-        CamembertForTokenClassification,
-        CamembertForQuestionAnswering,
-        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    )
     from .modeling_distilbert import (
         DistilBertPreTrainedModel,
         DistilBertForMaskedLM,
@@ -400,6 +392,13 @@ if is_tf_available():
         TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     )
 
+    from .modeling_tf_flaubert import (
+        TFFlaubertModel,
+        TFFlaubertWithLMHeadModel,
+        TFFlaubertForSequenceClassification,
+        TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    )
+
     from .modeling_tf_distilbert import (
         TFDistilBertPreTrainedModel,
         TFDistilBertMainLayer,
diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
old mode 100644
new mode 100755
index 2ddbaa006a..4fb08e0f70
--- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -25,6 +25,7 @@ from transformers import (
     CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
     CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
     DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
     GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
     OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
     ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -38,6 +39,7 @@ from transformers import (
     CamembertConfig,
     CTRLConfig,
     DistilBertConfig,
+    FlaubertConfig,
     GPT2Config,
     OpenAIGPTConfig,
     RobertaConfig,
@@ -50,6 +52,7 @@ from transformers import (
     TFCTRLLMHeadModel,
     TFDistilBertForMaskedLM,
     TFDistilBertForQuestionAnswering,
+    TFFlaubertWithLMHeadModel,
     TFGPT2LMHeadModel,
     TFOpenAIGPTLMHeadModel,
     TFRobertaForMaskedLM,
@@ -95,6 +98,8 @@ if is_torch_available():
         CamembertForMaskedLM,
         CamembertForSequenceClassification,
         CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        FlaubertWithLMHeadModel,
         DistilBertForMaskedLM,
         DistilBertForQuestionAnswering,
         DistilBertForSequenceClassification,
@@ -130,6 +135,8 @@ else:
         CamembertForMaskedLM,
         CamembertForSequenceClassification,
         CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        FlaubertWithLMHeadModel,
         DistilBertForMaskedLM,
         DistilBertForSequenceClassification,
         DistilBertForQuestionAnswering,
@@ -173,6 +180,8 @@ else:
         None,
         None,
         None,
+        None,
+        None,
     )
 
 
@@ -270,6 +279,13 @@ MODEL_CLASSES = {
         CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
     ),
+    "flaubert": (
+        FlaubertConfig,
+        TFFlaubertWithLMHeadModel,
+        FlaubertWithLMHeadModel,
+        FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    ),
     "distilbert": (
         DistilBertConfig,
         TFDistilBertForMaskedLM,
diff --git a/src/transformers/modeling_tf_camembert.py b/src/transformers/modeling_tf_camembert.py
index d6317cacfb..11318654c3 100644
--- a/src/transformers/modeling_tf_camembert.py
+++ b/src/transformers/modeling_tf_camembert.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" TF 2.0 RoBERTa model. """
+""" TF 2.0 CamemBERT model. """
 
 
 import logging
diff --git a/src/transformers/modeling_tf_flaubert.py b/src/transformers/modeling_tf_flaubert.py
new file mode 100644
index 0000000000..16706f1ba5
--- /dev/null
+++ b/src/transformers/modeling_tf_flaubert.py
@@ -0,0 +1,345 @@
+# coding=utf-8
+# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TF 2.0 Flaubert model.
+"""
+
+import logging
+import random
+
+import tensorflow as tf
+
+from .configuration_flaubert import FlaubertConfig
+from .file_utils import add_start_docstrings
+from .modeling_tf_xlm import (
+    TFXLMForSequenceClassification,
+    TFXLMMainLayer,
+    TFXLMModel,
+    TFXLMWithLMHeadModel,
+    get_masks,
+    shape_list,
+)
+
+
+logger = logging.getLogger(__name__)
+
+TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {}
+
+FLAUBERT_START_DOCSTRING = r"""
+
+    This model is a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ sub-class.
+    Use it as a regular TF 2.0 Keras Model and
+    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
+
+    Parameters:
+        config (:class:`~transformers.FlaubertConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+FLAUBERT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using :class:`transformers.BertTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            `What are input IDs? <../glossary.html#input-ids>`__
+        attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+            `What are attention masks? <../glossary.html#attention-mask>`__
+        langs (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input.
+            Indices are languages ids which can be obtained from the language names by using two conversion mappings
+            provided in the configuration of the model (only provided for multilingual models).
+            More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
+            the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
+            See usage examples detailed in the `multilingual documentation <https://huggingface.co/transformers/multilingual.html>`__.
+        token_type_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Segment token indices to indicate first and second portions of the inputs.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            `What are token type IDs? <../glossary.html#token-type-ids>`_
+        position_ids (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+            `What are position IDs? <../glossary.html#position-ids>`_
+        lengths (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
+            Length of each sentence that can be used to avoid performing attention on padding token indices.
+            You can also use `attention_mask` for the same result (see above), kept here for compatibility.
+            Indices selected in ``[0, ..., input_ids.size(-1)]``:
+        cache (:obj:`Dict[str, tf.Tensor]`, `optional`, defaults to :obj:`None`):
+            dictionary with ``tf.Tensor`` that contains pre-computed
+            hidden-states (key and values in the attention blocks) as computed by the model
+            (see `cache` output below). Can be used to speed up sequential decoding.
+            The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
+        head_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
+        inputs_embeds (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
+            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+"""
+
+
+@add_start_docstrings(
+    "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
+    FLAUBERT_START_DOCSTRING,
+)
+class TFFlaubertModel(TFXLMModel):
+    config_class = FlaubertConfig
+    pretrained_model_archive_map = TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFFlaubertModel, self).__init__(config, *inputs, **kwargs)
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+
+
+class TFFlaubertMainLayer(TFXLMMainLayer):
+    """XLM main layer with two Flaubert-specific options read from the config: ``layerdrop``
+    (probability of skipping a whole transformer layer during training) and ``pre_norm``
+    (apply layer normalization before each sub-layer instead of after it).
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFFlaubertMainLayer, self).__init__(config, *inputs, **kwargs)
+        self.layerdrop = getattr(config, "layerdrop", 0.0)
+        self.pre_norm = getattr(config, "pre_norm", False)
+
+    def call(
+        self,
+        inputs,
+        attention_mask=None,
+        langs=None,
+        token_type_ids=None,
+        position_ids=None,
+        lengths=None,
+        cache=None,
+        head_mask=None,
+        inputs_embeds=None,
+        training=False,
+    ):
+        """Run the Flaubert transformer stack.
+
+        ``inputs`` may be the ``input_ids`` tensor itself, a list/tuple holding the arguments in
+        signature order, or a dict keyed by argument name.
+
+        Returns a tuple starting with the last hidden state, followed by the tuple of hidden
+        states when ``self.output_hidden_states`` is set and by the tuple of attention outputs
+        when ``self.output_attentions`` is set.
+
+        Raises ``ValueError`` unless exactly one of ``input_ids`` / ``inputs_embeds`` is given,
+        and ``NotImplementedError`` when ``head_mask`` is not ``None``.
+        """
+        # removed: src_enc=None, src_len=None
+        if isinstance(inputs, (tuple, list)):
+            input_ids = inputs[0]
+            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
+            langs = inputs[2] if len(inputs) > 2 else langs
+            token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
+            position_ids = inputs[4] if len(inputs) > 4 else position_ids
+            lengths = inputs[5] if len(inputs) > 5 else lengths
+            cache = inputs[6] if len(inputs) > 6 else cache
+            head_mask = inputs[7] if len(inputs) > 7 else head_mask
+            inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
+            assert len(inputs) <= 9, "Too many inputs."
+        elif isinstance(inputs, dict):
+            input_ids = inputs.get("input_ids")
+            attention_mask = inputs.get("attention_mask", attention_mask)
+            langs = inputs.get("langs", langs)
+            token_type_ids = inputs.get("token_type_ids", token_type_ids)
+            position_ids = inputs.get("position_ids", position_ids)
+            lengths = inputs.get("lengths", lengths)
+            cache = inputs.get("cache", cache)
+            head_mask = inputs.get("head_mask", head_mask)
+            inputs_embeds = inputs.get("inputs_embeds", inputs_embeds)
+            assert len(inputs) <= 9, "Too many inputs."
+        else:
+            input_ids = inputs
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            bs, slen = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            bs, slen = shape_list(inputs_embeds)[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if lengths is None:
+            if input_ids is not None:
+                lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1)
+            else:
+                lengths = tf.convert_to_tensor([slen] * bs, tf.int32)
+        # mask = input_ids != self.pad_index
+
+        # check inputs
+        # assert shape_list(lengths)[0] == bs
+        tf.debugging.assert_equal(shape_list(lengths)[0], bs)
+        # assert lengths.max().item() <= slen
+        # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
+        # assert (src_enc is None) == (src_len is None)
+        # if src_enc is not None:
+        #     assert self.is_decoder
+        #     assert src_enc.size(0) == bs
+
+        # generate masks
+        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
+        # if self.is_decoder and src_enc is not None:
+        #     src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
+
+        # position_ids
+        if position_ids is None:
+            position_ids = tf.expand_dims(tf.range(slen), axis=0)
+        else:
+            # assert shape_list(position_ids) == [bs, slen]  # (slen, bs)
+            tf.debugging.assert_equal(shape_list(position_ids), [bs, slen])
+            # position_ids = position_ids.transpose(0, 1)
+
+        # langs
+        if langs is not None:
+            # assert shape_list(langs) == [bs, slen]  # (slen, bs)
+            tf.debugging.assert_equal(shape_list(langs), [bs, slen])
+            # langs = langs.transpose(0, 1)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen]
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.n_layers
+
+        # do not recompute cached elements
+        if cache is not None and input_ids is not None:
+            _slen = slen - cache["slen"]
+            input_ids = input_ids[:, -_slen:]
+            position_ids = position_ids[:, -_slen:]
+            if langs is not None:
+                langs = langs[:, -_slen:]
+            mask = mask[:, -_slen:]
+            attn_mask = attn_mask[:, -_slen:]
+
+        # embeddings
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings(input_ids)
+
+        tensor = inputs_embeds + self.position_embeddings(position_ids)
+        if langs is not None and self.use_lang_emb:
+            tensor = tensor + self.lang_embeddings(langs)
+        if token_type_ids is not None:
+            tensor = tensor + self.embeddings(token_type_ids)
+        tensor = self.layer_norm_emb(tensor)
+        tensor = self.dropout(tensor, training=training)
+        tensor = tensor * mask[..., tf.newaxis]
+
+        # transformer layers
+        hidden_states = ()
+        attentions = ()
+        for i in range(self.n_layers):
+            # LayerDrop
+            dropout_probability = random.uniform(0, 1)
+            if training and (dropout_probability < self.layerdrop):
+                continue
+
+            if self.output_hidden_states:
+                hidden_states = hidden_states + (tensor,)
+
+            # self attention
+            if not self.pre_norm:
+                attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training)
+                attn = attn_outputs[0]
+                if self.output_attentions:
+                    attentions = attentions + (attn_outputs[1],)
+                attn = self.dropout(attn, training=training)
+                tensor = tensor + attn
+                tensor = self.layer_norm1[i](tensor)
+            else:
+                tensor_normalized = self.layer_norm1[i](tensor)
+                attn_outputs = self.attentions[i](
+                    [tensor_normalized, attn_mask, None, cache, head_mask[i]], training=training
+                )
+                attn = attn_outputs[0]
+                if self.output_attentions:
+                    attentions = attentions + (attn_outputs[1],)
+                attn = self.dropout(attn, training=training)
+                tensor = tensor + attn
+
+            # encoder attention (for decoder only)
+            # if self.is_decoder and src_enc is not None:
+            #     attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
+            #     attn = F.dropout(attn, p=self.dropout, training=self.training)
+            #     tensor = tensor + attn
+            #     tensor = self.layer_norm15[i](tensor)
+
+            # FFN
+            if not self.pre_norm:
+                tensor = tensor + self.ffns[i](tensor)
+                tensor = self.layer_norm2[i](tensor)
+            else:
+                tensor_normalized = self.layer_norm2[i](tensor)
+                tensor = tensor + self.ffns[i](tensor_normalized)
+
+            tensor = tensor * mask[..., tf.newaxis]
+
+        # Add last hidden state
+        if self.output_hidden_states:
+            hidden_states = hidden_states + (tensor,)
+
+        # update cache length (TF tensors have no PyTorch-style ``.size(dim)``, so use ``shape_list``)
+        if cache is not None:
+            cache["slen"] += shape_list(tensor)[1]
+
+        # move back sequence length to dimension 0
+        # tensor = tensor.transpose(0, 1)
+
+        outputs = (tensor,)
+        if self.output_hidden_states:
+            outputs = outputs + (hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (attentions,)
+        return outputs  # outputs, (hidden_states), (attentions)
+
+
+@add_start_docstrings(
+    """The Flaubert Model transformer with a language modeling head on top
+    (linear layer with weights tied to the input embeddings). """,
+    FLAUBERT_START_DOCSTRING,
+)
+class TFFlaubertWithLMHeadModel(TFXLMWithLMHeadModel):
+    config_class = FlaubertConfig
+    pretrained_model_archive_map = TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFFlaubertWithLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+
+
+@add_start_docstrings(
+    """Flaubert Model with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+    FLAUBERT_START_DOCSTRING,
+)
+class TFFlaubertForSequenceClassification(TFXLMForSequenceClassification):
+    config_class = FlaubertConfig
+    pretrained_model_archive_map = TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFFlaubertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")