From 155c782a2ccd103cf63ad48a2becd7c76a7d2115 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Mon, 11 Nov 2019 22:19:14 -0500 Subject: [PATCH] [inputs_embeds] All TF models + tests --- transformers/modeling_tf_bert.py | 59 +++++++++++++------ transformers/modeling_tf_ctrl.py | 27 ++++++--- transformers/modeling_tf_distilbert.py | 38 ++++++++---- transformers/modeling_tf_gpt2.py | 44 +++++++++----- transformers/modeling_tf_openai.py | 44 +++++++++----- transformers/modeling_tf_roberta.py | 10 +++- transformers/modeling_tf_transfo_xl.py | 53 +++++++++++------ transformers/modeling_tf_xlm.py | 31 +++++++--- transformers/modeling_tf_xlnet.py | 26 ++++++-- transformers/tests/modeling_tf_bert_test.py | 4 -- transformers/tests/modeling_tf_common_test.py | 21 +++++++ 11 files changed, 252 insertions(+), 105 deletions(-) diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index 66d5efd87c..ad0815e2ca 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -142,19 +142,25 @@ class TFBertEmbeddings(tf.keras.layers.Layer): def _embedding(self, inputs, training=False): """Applies embedding based on inputs tensor.""" - input_ids, position_ids, token_type_ids = inputs + input_ids, position_ids, token_type_ids, inputs_embeds = inputs - seq_length = tf.shape(input_ids)[1] + if input_ids is not None: + input_shape = tf.shape(input_ids) + else: + input_shape = tf.shape(inputs_embeds)[:-1] + + seq_length = input_shape[1] if position_ids is None: position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] if token_type_ids is None: - token_type_ids = tf.fill(tf.shape(input_ids), 0) + token_type_ids = tf.fill(input_shape, 0) - words_embeddings = tf.gather(self.word_embeddings, input_ids) + if inputs_embeds is None: + inputs_embeds = tf.gather(self.word_embeddings, input_ids) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = inputs_embeds + position_embeddings + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings, training=training) return embeddings @@ -473,28 +479,39 @@ class TFBertMainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): + def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask - assert len(inputs) <= 5, "Too many inputs." + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get('input_ids') attention_mask = inputs.get('attention_mask', attention_mask) token_type_ids = inputs.get('token_type_ids', token_type_ids) position_ids = inputs.get('position_ids', position_ids) head_mask = inputs.get('head_mask', head_mask) - assert len(inputs) <= 5, "Too many inputs." + inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + if attention_mask is None: - attention_mask = tf.fill(tf.shape(input_ids), 1) + attention_mask = tf.fill(input_shape, 1) if token_type_ids is None: - token_type_ids = tf.fill(tf.shape(input_ids), 0) + token_type_ids = tf.fill(input_shape, 0) # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] @@ -523,7 +540,7 @@ class TFBertMainLayer(tf.keras.layers.Layer): head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) - embedding_output = self.embeddings([input_ids, position_ids, token_type_ids], training=training) + embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) sequence_output = encoder_outputs[0] @@ -901,33 +918,39 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): kernel_initializer=get_initializer(config.initializer_range), name='classifier') - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): + def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask - assert len(inputs) <= 5, "Too many inputs." + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get('input_ids') attention_mask = inputs.get('attention_mask', attention_mask) token_type_ids = inputs.get('token_type_ids', token_type_ids) position_ids = inputs.get('position_ids', position_ids) head_mask = inputs.get('head_mask', head_mask) - assert len(inputs) <= 5, "Too many inputs." + inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs - num_choices = tf.shape(input_ids)[1] - seq_length = tf.shape(input_ids)[2] + if input_ids is not None: + num_choices = tf.shape(input_ids)[1] + seq_length = tf.shape(input_ids)[2] + else: + num_choices = tf.shape(inputs_embeds)[1] + seq_length = tf.shape(inputs_embeds)[2] - flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None - flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask] + flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds] outputs = self.bert(flat_inputs, training=training) diff --git a/transformers/modeling_tf_ctrl.py b/transformers/modeling_tf_ctrl.py index 99738a8b14..ae66dbc82c 100644 --- a/transformers/modeling_tf_ctrl.py +++ b/transformers/modeling_tf_ctrl.py @@ -204,7 +204,7 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): + def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past @@ -212,7 +212,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids head_mask = inputs[5] if len(inputs) > 5 else head_mask - assert len(inputs) <= 6, "Too many inputs." + inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds + assert len(inputs) <= 7, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get('input_ids') past = inputs.get('past', past) @@ -220,12 +221,20 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): token_type_ids = inputs.get('token_type_ids', token_type_ids) position_ids = inputs.get('position_ids', position_ids) head_mask = inputs.get('head_mask', head_mask) - assert len(inputs) <= 6, "Too many inputs." + inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + assert len(inputs) <= 7, "Too many inputs." else: input_ids = inputs - input_shape = shape_list(input_ids) - input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") if past is None: past_length = 0 @@ -233,8 +242,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): else: past_length = shape_list(past[0][0])[-2] if position_ids is None: - position_ids = tf.range(past_length, shape_list(input_ids)[-1] + past_length, dtype=tf.int32)[tf.newaxis, :] - position_ids = tf.tile(position_ids, [shape_list(input_ids)[0], 1]) + position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :] + position_ids = tf.tile(position_ids, [input_shape[0], 1]) # Attention mask. if attention_mask is not None: @@ -273,8 +282,8 @@ class TFCTRLMainLayer(tf.keras.layers.Layer): token_type_embeds = 0 position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) - inputs_embeds = self.w(input_ids, mode='embedding') - # x = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded + if inputs_embeds is None: + inputs_embeds = self.w(input_ids, mode='embedding') seq_len = input_shape[-1] mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py index 4b1f3e676b..6d393bb95d 100644 --- a/transformers/modeling_tf_distilbert.py +++ b/transformers/modeling_tf_distilbert.py @@ -96,7 +96,7 @@ class TFEmbeddings(tf.keras.layers.Layer): initializer=get_initializer(self.initializer_range)) super(TFEmbeddings, self).build(input_shape) - def call(self, inputs, mode="embedding", training=False): + def call(self, inputs, inputs_embeds=None, mode="embedding", training=False): """Get token embeddings of inputs. Args: inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) @@ -112,13 +112,13 @@ class TFEmbeddings(tf.keras.layers.Layer): https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ if mode == "embedding": - return self._embedding(inputs, training=training) + return self._embedding(inputs, inputs_embeds=inputs_embeds, training=training) elif mode == "linear": return self._linear(inputs) else: raise ValueError("mode {} is not valid.".format(mode)) - def _embedding(self, inputs, training=False): + def _embedding(self, inputs, inputs_embeds=None, training=False): """ Parameters ---------- @@ -136,14 +136,19 @@ class TFEmbeddings(tf.keras.layers.Layer): else: input_ids, position_ids = inputs - seq_length = tf.shape(input_ids)[1] + if input_ids is not None: + seq_length = tf.shape(input_ids)[1] + else: + seq_length = tf.shape(inputs_embeds)[1] + if position_ids is None: position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] - word_embeddings = tf.gather(self.word_embeddings, input_ids) + if inputs_embeds is None: + inputs_embeds = tf.gather(self.word_embeddings, input_ids) position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) - embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) + embeddings = inputs_embeds + position_embeddings # (bs, max_seq_length, dim) embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) embeddings = self.dropout(embeddings, training=training) # (bs, max_seq_length, dim) return embeddings @@ -407,22 +412,33 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): def _prune_heads(self, heads_to_prune): raise NotImplementedError - def call(self, inputs, attention_mask=None, head_mask=None, training=False): + def call(self, inputs, attention_mask=None, head_mask=None, inputs_embeds=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask head_mask = inputs[2] if len(inputs) > 2 else head_mask - assert len(inputs) <= 3, "Too many inputs." + inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds + assert len(inputs) <= 4, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get('input_ids') attention_mask = inputs.get('attention_mask', attention_mask) head_mask = inputs.get('head_mask', head_mask) - assert len(inputs) <= 3, "Too many inputs." + inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + assert len(inputs) <= 4, "Too many inputs." else: input_ids = inputs + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + if attention_mask is None: - attention_mask = tf.ones(shape_list(input_ids)) # (bs, seq_length) + attention_mask = tf.ones(input_shape) # (bs, seq_length) attention_mask = tf.cast(attention_mask, dtype=tf.float32) # Prepare head mask if needed @@ -435,7 +451,7 @@ class TFDistilBertMainLayer(tf.keras.layers.Layer): else: head_mask = [None] * self.num_hidden_layers - embedding_output = self.embeddings(input_ids) # (bs, seq_length, dim) + embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds) # (bs, seq_length, dim) tfmr_output = self.transformer([embedding_output, attention_mask, head_mask], training=training) return tfmr_output # last-layer hidden-state, (all hidden_states), (all attentions) diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py index 23866a1a0a..5e416a5e3a 100644 --- a/transformers/modeling_tf_gpt2.py +++ b/transformers/modeling_tf_gpt2.py @@ -231,7 +231,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): + def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past @@ -239,7 +239,8 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids head_mask = inputs[5] if len(inputs) > 5 else head_mask - assert len(inputs) <= 6, "Too many inputs." + inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds + assert len(inputs) <= 7, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get('input_ids') past = inputs.get('past', past) @@ -247,17 +248,28 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): token_type_ids = inputs.get('token_type_ids', token_type_ids) position_ids = inputs.get('position_ids', position_ids) head_mask = inputs.get('head_mask', head_mask) - assert len(inputs) <= 6, "Too many inputs." + inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + assert len(inputs) <= 7, "Too many inputs." else: input_ids = inputs + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + if past is None: past_length = 0 past = [None] * len(self.h) else: past_length = shape_list(past[0][0])[-2] if position_ids is None: - position_ids = tf.range(past_length, shape_list(input_ids)[-1] + past_length, dtype=tf.int32)[tf.newaxis, :] + position_ids = tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32)[tf.newaxis, :] if attention_mask is not None: # We create a 3D attention mask from a 2D tensor mask. @@ -289,11 +301,10 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) - input_shape = shape_list(input_ids) - input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) - inputs_embeds = self.wte(input_ids, mode='embedding') + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids, mode='embedding') position_embeds = self.wpe(position_ids) if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) @@ -569,7 +580,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): def get_output_embeddings(self): return self.transformer.wte - def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False): + def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] past = inputs[1] if len(inputs) > 1 else past @@ -577,8 +588,9 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids position_ids = inputs[4] if len(inputs) > 4 else position_ids head_mask = inputs[5] if len(inputs) > 5 else head_mask - mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids - assert len(inputs) <= 7, "Too many inputs." + inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds + mc_token_ids = inputs[7] if len(inputs) > 7 else mc_token_ids + assert len(inputs) <= 8, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get('input_ids') past = inputs.get('past', past) @@ -586,21 +598,25 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): token_type_ids = inputs.get('token_type_ids', token_type_ids) position_ids = inputs.get('position_ids', position_ids) head_mask = inputs.get('head_mask', head_mask) + inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) mc_token_ids = inputs.get('mc_token_ids', mc_token_ids) - assert len(inputs) <= 7, "Too many inputs." + assert len(inputs) <= 8, "Too many inputs." else: input_ids = inputs - input_shapes = shape_list(input_ids) + if input_ids is not None: + input_shapes = shape_list(input_ids) + else: + input_shapes = shape_list(inputs_embeds)[:-1] seq_length = input_shapes[-1] - flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None - flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask] + flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds] transformer_outputs = self.transformer(flat_inputs, training=training) hidden_states = transformer_outputs[0] diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py index bddd9338b1..c553d92317 100644 --- a/transformers/modeling_tf_openai.py +++ b/transformers/modeling_tf_openai.py @@ -229,26 +229,38 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): """ raise NotImplementedError - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): + def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask - assert len(inputs) <= 5, "Too many inputs." + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + assert len(inputs) <= 6, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get('input_ids') attention_mask = inputs.get('attention_mask', attention_mask) token_type_ids = inputs.get('token_type_ids', token_type_ids) position_ids = inputs.get('position_ids', position_ids) head_mask = inputs.get('head_mask', head_mask) - assert len(inputs) <= 5, "Too many inputs." + inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + assert len(inputs) <= 6, "Too many inputs." else: input_ids = inputs + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + if position_ids is None: - position_ids = tf.range(shape_list(input_ids)[-1], dtype=tf.int32)[tf.newaxis, :] + position_ids = tf.range(input_shape[-1], dtype=tf.int32)[tf.newaxis, :] if attention_mask is not None: # We create a 3D attention mask from a 2D tensor mask. @@ -280,11 +292,10 @@ class TFOpenAIGPTMainLayer(tf.keras.layers.Layer): head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) - input_shape = shape_list(input_ids) - input_ids = tf.reshape(input_ids, [-1, input_shape[-1]]) position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]]) - inputs_embeds = self.tokens_embed(input_ids, mode='embedding') + if inputs_embeds is None: + inputs_embeds = self.tokens_embed(input_ids, mode='embedding') position_embeds = self.positions_embed(position_ids) if token_type_ids is not None: token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]]) @@ -533,36 +544,41 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): def get_output_embeddings(self): return self.transformer.tokens_embed - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False): + def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids position_ids = inputs[3] if len(inputs) > 3 else position_ids head_mask = inputs[4] if len(inputs) > 4 else head_mask - mc_token_ids = inputs[5] if len(inputs) > 5 else mc_token_ids - assert len(inputs) <= 6, "Too many inputs." + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids + assert len(inputs) <= 7, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get('input_ids') attention_mask = inputs.get('attention_mask', attention_mask) token_type_ids = inputs.get('token_type_ids', token_type_ids) position_ids = inputs.get('position_ids', position_ids) head_mask = inputs.get('head_mask', head_mask) + inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) mc_token_ids = inputs.get('mc_token_ids', mc_token_ids) - assert len(inputs) <= 6, "Too many inputs." + assert len(inputs) <= 7, "Too many inputs." else: input_ids = inputs - input_shapes = shape_list(input_ids) + if input_ids is not None: + input_shapes = shape_list(input_ids) + else: + input_shapes = shape_list(inputs_embeds)[:-1] seq_length = input_shapes[-1] - flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None - flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask] + flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask, inputs_embeds] transformer_outputs = self.transformer(flat_inputs, training=training) hidden_states = transformer_outputs[0] diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py index c335910dc6..450c0c72f2 100644 --- a/transformers/modeling_tf_roberta.py +++ b/transformers/modeling_tf_roberta.py @@ -48,13 +48,17 @@ class TFRobertaEmbeddings(TFBertEmbeddings): def _embedding(self, inputs, training=False): """Applies embedding based on inputs tensor.""" - input_ids, position_ids, token_type_ids = inputs + input_ids, position_ids, token_type_ids, inputs_embeds = inputs + + if input_ids is not None: + seq_length = tf.shape(input_ids)[1] + else: + seq_length = tf.shape(inputs_embeds)[1] - seq_length = tf.shape(input_ids)[1] if position_ids is None: position_ids = tf.range(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=tf.int32)[tf.newaxis, :] - return super(TFRobertaEmbeddings, self)._embedding([input_ids, position_ids, token_type_ids], training=training) + return super(TFRobertaEmbeddings, self)._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) class TFRobertaMainLayer(TFBertMainLayer): diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py index 8c2a35b352..8a8d11cfbc 100644 --- a/transformers/modeling_tf_transfo_xl.py +++ b/transformers/modeling_tf_transfo_xl.py @@ -430,11 +430,11 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): def _prune_heads(self, heads): raise NotImplementedError - def init_mems(self, data): + def init_mems(self, bsz): if self.mem_len > 0: mems = [] for i in range(self.n_layer): - empty = tf.zeros([self.mem_len, shape_list(data)[1], self.d_model]) + empty = tf.zeros([self.mem_len, bsz, self.d_model]) mems.append(empty) return mems @@ -464,28 +464,37 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): return new_mems - def call(self, inputs, mems=None, head_mask=None, training=False): + def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] mems = inputs[1] if len(inputs) > 1 else mems head_mask = inputs[2] if len(inputs) > 2 else head_mask - assert len(inputs) <= 3, "Too many inputs." + inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds + assert len(inputs) <= 4, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get('input_ids') mems = inputs.get('mems', mems) head_mask = inputs.get('head_mask', head_mask) - assert len(inputs) <= 3, "Too many inputs." + inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + assert len(inputs) <= 4, "Too many inputs." else: input_ids = inputs # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # so we transpose here from shape [bsz, len] to shape [len, bsz] - input_ids = tf.transpose(input_ids, perm=(1, 0)) + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_ids = tf.transpose(input_ids, perm=(1, 0)) + qlen, bsz = shape_list(input_ids) + elif inputs_embeds is not None: + inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2)) + qlen, bsz = shape_list(inputs_embeds)[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") if mems is None: - mems = self.init_mems(input_ids) - - qlen, bsz = shape_list(input_ids) + mems = self.init_mems(bsz) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -497,7 +506,10 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): else: head_mask = [None] * self.n_layer - word_emb = self.word_emb(input_ids) + if inputs_embeds is not None: + word_emb = inputs_embeds + else: + word_emb = self.word_emb(input_ids) mlen = shape_list(mems[0])[0] if mems is not None else 0 klen = mlen + qlen @@ -723,28 +735,33 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): def reset_length(self, tgt_len, ext_len, mem_len): self.transformer.reset_length(tgt_len, ext_len, mem_len) - def init_mems(self, data): - return self.transformer.init_mems(data) + def init_mems(self, bsz): + return self.transformer.init_mems(bsz) - def call(self, inputs, mems=None, head_mask=None, labels=None, training=False): + def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, labels=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] mems = inputs[1] if len(inputs) > 1 else mems head_mask = inputs[2] if len(inputs) > 2 else head_mask - labels = inputs[3] if len(inputs) > 3 else labels - assert len(inputs) <= 4, "Too many inputs." + inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds + labels = inputs[4] if len(inputs) > 4 else labels + assert len(inputs) <= 5, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get('input_ids') mems = inputs.get('mems', mems) head_mask = inputs.get('head_mask', head_mask) + inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) labels = inputs.get('labels', labels) - assert len(inputs) <= 4, "Too many inputs." + assert len(inputs) <= 5, "Too many inputs." else: input_ids = inputs - bsz, tgt_len = shape_list(input_ids)[:2] + if input_ids is not None: + bsz, tgt_len = shape_list(input_ids)[:2] + else: + bsz, tgt_len = shape_list(inputs_embeds)[:2] - transformer_outputs = self.transformer([input_ids, mems, head_mask], training=training) + transformer_outputs = self.transformer([input_ids, mems, head_mask, inputs_embeds], training=training) last_hidden = transformer_outputs[0] pred_hid = last_hidden[:, -tgt_len:] diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py index 20fbdca732..6f11b0537d 100644 --- a/transformers/modeling_tf_xlm.py +++ b/transformers/modeling_tf_xlm.py @@ -291,7 +291,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): raise NotImplementedError def call(self, inputs, attention_mask=None, langs=None, token_type_ids=None, - position_ids=None, lengths=None, cache=None, head_mask=None, + position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, training=False): # removed: src_enc=None, src_len=None if isinstance(inputs, (tuple, list)): input_ids = inputs[0] @@ -302,7 +302,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer): lengths = inputs[5] if len(inputs) > 5 else lengths cache = inputs[6] if len(inputs) > 6 else cache head_mask = inputs[7] if len(inputs) > 7 else head_mask - assert len(inputs) <= 8, "Too many inputs." + inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds + assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get('input_ids') attention_mask = inputs.get('attention_mask', attention_mask) @@ -312,16 +313,28 @@ class TFXLMMainLayer(tf.keras.layers.Layer): lengths = inputs.get('lengths', lengths) cache = inputs.get('cache', cache) head_mask = inputs.get('head_mask', head_mask) - assert len(inputs) <= 8, "Too many inputs." + inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + assert len(inputs) <= 9, "Too many inputs." else: input_ids = inputs + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + bs, slen = shape_list(input_ids) + elif inputs_embeds is not None: + bs, slen = shape_list(inputs_embeds)[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + if lengths is None: - lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1) + if input_ids is not None: + lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1) + else: + lengths = tf.convert_to_tensor([slen]*bs, tf.int32) # mask = input_ids != self.pad_index # check inputs - bs, slen = shape_list(input_ids) # assert shape_list(lengths)[0] == bs tf.debugging.assert_equal(shape_list(lengths)[0], bs) # assert lengths.max().item() <= slen @@ -361,7 +374,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer): head_mask = [None] * self.n_layers # do not recompute cached elements - if cache is not None: + if cache is not None and input_ids is not None: _slen = slen - cache['slen'] input_ids = input_ids[:, -_slen:] position_ids = position_ids[:, -_slen:] @@ -371,8 +384,10 @@ class TFXLMMainLayer(tf.keras.layers.Layer): attn_mask = attn_mask[:, -_slen:] # embeddings - tensor = self.embeddings(input_ids) - tensor = tensor + self.position_embeddings(position_ids) + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + tensor = inputs_embeds + self.position_embeddings(position_ids) if langs is not None and self.use_lang_emb: tensor = tensor + self.lang_embeddings(langs) if token_type_ids is not None: diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py index 7ab95e7c9f..4733ea8589 100644 --- a/transformers/modeling_tf_xlnet.py +++ b/transformers/modeling_tf_xlnet.py @@ -487,7 +487,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): return pos_emb def call(self, inputs, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, - token_type_ids=None, input_mask=None, head_mask=None, training=False): + token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, training=False): if isinstance(inputs, (tuple, list)): input_ids = inputs[0] attention_mask = inputs[1] if len(inputs) > 1 else attention_mask @@ -497,7 +497,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): token_type_ids = inputs[5] if len(inputs) > 5 else token_type_ids input_mask = inputs[6] if len(inputs) > 6 else input_mask head_mask = inputs[7] if len(inputs) > 7 else head_mask - assert len(inputs) <= 8, "Too many inputs." + inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds + assert len(inputs) <= 9, "Too many inputs." elif isinstance(inputs, dict): input_ids = inputs.get('input_ids') attention_mask = inputs.get('attention_mask', attention_mask) @@ -507,7 +508,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): token_type_ids = inputs.get('token_type_ids', token_type_ids) input_mask = inputs.get('input_mask', input_mask) head_mask = inputs.get('head_mask', head_mask) - assert len(inputs) <= 8, "Too many inputs." + inputs_embeds = inputs.get('inputs_embeds', inputs_embeds) + assert len(inputs) <= 9, "Too many inputs." else: input_ids = inputs @@ -515,14 +517,23 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): # but we want a unified interface in the library with the batch size on the first dimension # so we move here the first dimension (batch) to the end - input_ids = tf.transpose(input_ids, perm=(1, 0)) + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_ids = tf.transpose(input_ids, perm=(1, 0)) + qlen, bsz = shape_list(input_ids)[:2] + elif inputs_embeds is not None: + inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2)) + qlen, bsz = shape_list(inputs_embeds)[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + token_type_ids = tf.transpose(token_type_ids, perm=(1, 0)) if token_type_ids is not None else None input_mask = tf.transpose(input_mask, perm=(1, 0)) if input_mask is not None else None attention_mask = tf.transpose(attention_mask, perm=(1, 0)) if attention_mask is not None else None perm_mask = tf.transpose(perm_mask, perm=(1, 2, 0)) if perm_mask is not None else None target_mapping = tf.transpose(target_mapping, perm=(1, 2, 0)) if target_mapping is not None else None - qlen, bsz = shape_list(input_ids)[:2] mlen = shape_list(mems[0])[0] if mems is not None and mems[0] is not None else 0 klen = mlen + qlen @@ -573,7 +584,10 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): non_tgt_mask = None ##### Word embeddings and prepare h & g hidden states - word_emb_k = self.word_embedding(input_ids) + if inputs_embeds is not None: + word_emb_k = inputs_embeds + else: + word_emb_k = self.word_embedding(input_ids) output_h = self.dropout(word_emb_k, training=training) if target_mapping is not None: word_emb_q = tf.tile(self.mask_emb, [tf.shape(target_mapping)[0], bsz, 1]) diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py index a1715d2568..bcee97435e 100644 --- a/transformers/tests/modeling_tf_bert_test.py +++ b/transformers/tests/modeling_tf_bert_test.py @@ -131,10 +131,6 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester): def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): model = TFBertModel(config=config) - # inputs = {'input_ids': input_ids, - # 'attention_mask': input_mask, - # 'token_type_ids': token_type_ids} - # sequence_output, pooled_output = model(**inputs) inputs = {'input_ids': input_ids, 'attention_mask': input_mask, 'token_type_ids': token_type_ids} diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py index ca7beb0925..2bb7cc9c5f 100644 --- a/transformers/tests/modeling_tf_common_test.py +++ b/transformers/tests/modeling_tf_common_test.py @@ -411,6 +411,27 @@ class TFCommonTestCases: first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0] self.assertTrue(tf.math.equal(first, second).numpy().all()) + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict["input_ids"] + del inputs_dict["input_ids"] + + for model_class in self.all_model_classes: + model = model_class(config) + + wte = model.get_input_embeddings() + try: + x = wte(input_ids, mode="embedding") + except: + try: + x = wte([input_ids], mode="embedding") + except: + x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32) + # ^^ In our TF models, the input_embeddings can take slightly different forms, + # so we try two of them and fall back to just synthetically creating a dummy tensor of ones. + inputs_dict["inputs_embeds"] = x + outputs = model(inputs_dict) + def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): """Creates a random int32 tensor of the shape within the vocab size."""