From adb5c79ff2ffcd2e4a43a12f082cca55f7630a96 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 28 Nov 2019 15:51:43 +0100 Subject: [PATCH] update all tf.shape and tensor.shape to shape_list --- .../adding_a_new_model/modeling_tf_xxx.py | 6 ++--- transformers/__init__.py | 2 +- transformers/modeling_tf_albert.py | 27 ++++++++----------- transformers/modeling_tf_bert.py | 26 +++++++++--------- transformers/modeling_tf_ctrl.py | 2 +- transformers/modeling_tf_distilbert.py | 8 +++--- transformers/modeling_tf_gpt2.py | 2 +- transformers/modeling_tf_openai.py | 2 +- transformers/modeling_tf_roberta.py | 6 ++--- transformers/modeling_tf_transfo_xl.py | 2 +- .../modeling_tf_transfo_xl_utilities.py | 4 +-- transformers/modeling_tf_utils.py | 2 +- transformers/modeling_tf_xlnet.py | 13 +++++---- 13 files changed, 48 insertions(+), 54 deletions(-) diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py index f1d898b47a..59f798bdbf 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/modeling_tf_xxx.py @@ -32,7 +32,7 @@ import numpy as np import tensorflow as tf from .configuration_xxx import XxxConfig -from .modeling_tf_utils import TFPreTrainedModel, get_initializer +from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) @@ -121,9 +121,9 @@ class TFXxxMainLayer(tf.keras.layers.Layer): input_ids = inputs if attention_mask is None: - attention_mask = tf.fill(tf.shape(input_ids), 1) + attention_mask = tf.fill(shape_list(input_ids), 1) if token_type_ids is None: - token_type_ids = tf.fill(tf.shape(input_ids), 0) + token_type_ids = tf.fill(shape_list(input_ids), 0) # We create a 3D attention mask from a 2D tensor mask. 
# Sizes are [batch_size, 1, 1, to_seq_length] diff --git a/transformers/__init__.py b/transformers/__init__.py index b29ad38e73..de25c24b9e 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -118,7 +118,7 @@ if is_torch_available(): # TensorFlow if is_tf_available(): - from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary + from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, TFAutoModelWithLMHead) diff --git a/transformers/modeling_tf_albert.py b/transformers/modeling_tf_albert.py index b2bf66f750..164dc74320 100644 --- a/transformers/modeling_tf_albert.py +++ b/transformers/modeling_tf_albert.py @@ -16,18 +16,13 @@ """ TF 2.0 ALBERT model. """ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging -import math -import os import sys -from io import open -import numpy as np import tensorflow as tf from .configuration_albert import AlbertConfig -from .modeling_tf_utils import TFPreTrainedModel, get_initializer +from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list from .modeling_tf_bert import ACT2FN, TFBertSelfAttention from .file_utils import add_start_docstrings @@ -110,9 +105,9 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): input_ids, position_ids, token_type_ids, inputs_embeds = inputs if input_ids is not None: - input_shape = tf.shape(input_ids) + input_shape = shape_list(input_ids) else: - input_shape = tf.shape(inputs_embeds)[:-1] + input_shape = shape_list(inputs_embeds)[:-1] seq_length = input_shape[1] if position_ids is None: @@ -137,8 +132,8 @@ class TFAlbertEmbeddings(tf.keras.layers.Layer): Returns: float32 tensor with shape [batch_size, length, vocab_size]. 
""" - batch_size = tf.shape(inputs)[0] - length = tf.shape(inputs)[1] + batch_size = shape_list(inputs)[0] + length = shape_list(inputs)[1] x = tf.reshape(inputs, [-1, self.config.embedding_size]) logits = tf.matmul(x, self.word_embeddings, transpose_b=True) return tf.reshape(logits, [batch_size, length, self.config.vocab_size]) @@ -183,7 +178,7 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer): def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs - batch_size = tf.shape(hidden_states)[0] + batch_size = shape_list(hidden_states)[0] mixed_query_layer = self.query(hidden_states) mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) @@ -196,7 +191,7 @@ class TFAlbertSelfAttention(tf.keras.layers.Layer): # (batch size, num_heads, seq_len_q, seq_len_k) attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # scale attention_scores - dk = tf.cast(tf.shape(key_layer)[-1], tf.float32) + dk = tf.cast(shape_list(key_layer)[-1], tf.float32) attention_scores = attention_scores / tf.math.sqrt(dk) if attention_mask is not None: @@ -264,7 +259,7 @@ class TFAlbertAttention(TFBertSelfAttention): def call(self, inputs, training=False): input_tensor, attention_mask, head_mask = inputs - batch_size = tf.shape(input_tensor)[0] + batch_size = shape_list(input_tensor)[0] mixed_query_layer = self.query(input_tensor) mixed_key_layer = self.key(input_tensor) mixed_value_layer = self.value(input_tensor) @@ -277,7 +272,7 @@ class TFAlbertAttention(TFBertSelfAttention): # (batch size, num_heads, seq_len_q, seq_len_k) attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # scale attention_scores - dk = tf.cast(tf.shape(key_layer)[-1], tf.float32) + dk = tf.cast(shape_list(key_layer)[-1], tf.float32) attention_scores = attention_scores / tf.math.sqrt(dk) if attention_mask is not None: @@ -645,9 +640,9 @@ class TFAlbertModel(TFAlbertPreTrainedModel): if input_ids is not None and 
inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - input_shape = tf.shape(input_ids) + input_shape = shape_list(input_ids) elif inputs_embeds is not None: - input_shape = inputs_embeds.shape[:-1] + input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index ad0815e2ca..5aa7bb3da2 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -28,7 +28,7 @@ import numpy as np import tensorflow as tf from .configuration_bert import BertConfig -from .modeling_tf_utils import TFPreTrainedModel, get_initializer +from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) @@ -145,9 +145,9 @@ class TFBertEmbeddings(tf.keras.layers.Layer): input_ids, position_ids, token_type_ids, inputs_embeds = inputs if input_ids is not None: - input_shape = tf.shape(input_ids) + input_shape = shape_list(input_ids) else: - input_shape = tf.shape(inputs_embeds)[:-1] + input_shape = shape_list(inputs_embeds)[:-1] seq_length = input_shape[1] if position_ids is None: @@ -172,8 +172,8 @@ class TFBertEmbeddings(tf.keras.layers.Layer): Returns: float32 tensor with shape [batch_size, length, vocab_size]. 
""" - batch_size = tf.shape(inputs)[0] - length = tf.shape(inputs)[1] + batch_size = shape_list(inputs)[0] + length = shape_list(inputs)[1] x = tf.reshape(inputs, [-1, self.hidden_size]) logits = tf.matmul(x, self.word_embeddings, transpose_b=True) @@ -214,7 +214,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer): def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs - batch_size = tf.shape(hidden_states)[0] + batch_size = shape_list(hidden_states)[0] mixed_query_layer = self.query(hidden_states) mixed_key_layer = self.key(hidden_states) mixed_value_layer = self.value(hidden_states) @@ -225,7 +225,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer): # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) # (batch size, num_heads, seq_len_q, seq_len_k) - dk = tf.cast(tf.shape(key_layer)[-1], tf.float32) # scale attention_scores + dk = tf.cast(shape_list(key_layer)[-1], tf.float32) # scale attention_scores attention_scores = attention_scores / tf.math.sqrt(dk) if attention_mask is not None: @@ -502,9 +502,9 @@ class TFBertMainLayer(tf.keras.layers.Layer): if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: - input_shape = input_ids.shape + input_shape = shape_list(input_ids) elif inputs_embeds is not None: - input_shape = inputs_embeds.shape[:-1] + input_shape = shape_list(inputs_embeds)[:-1] else: raise ValueError("You have to specify either input_ids or inputs_embeds") @@ -939,11 +939,11 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel): input_ids = inputs if input_ids is not None: - num_choices = tf.shape(input_ids)[1] - seq_length = tf.shape(input_ids)[2] + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] else: - num_choices = tf.shape(inputs_embeds)[1] - seq_length = 
tf.shape(inputs_embeds)[2] + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None diff --git a/transformers/modeling_tf_ctrl.py b/transformers/modeling_tf_ctrl.py index ae66dbc82c..6d0d6a57ad 100644 --- a/transformers/modeling_tf_ctrl.py +++ b/transformers/modeling_tf_ctrl.py @@ -95,7 +95,7 @@ class TFMultiHeadAttention(tf.keras.layers.Layer): def call(self, inputs, training=False): v, k, q, mask, layer_past, attention_mask, head_mask = inputs - batch_size = q.shape[0] + batch_size = shape_list(q)[0] q = self.Wq(q) k = self.Wk(k) diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py index 6d393bb95d..b3d4889475 100644 --- a/transformers/modeling_tf_distilbert.py +++ b/transformers/modeling_tf_distilbert.py @@ -137,9 +137,9 @@ class TFEmbeddings(tf.keras.layers.Layer): input_ids, position_ids = inputs if input_ids is not None: - seq_length = tf.shape(input_ids)[1] + seq_length = shape_list(input_ids)[1] else: - seq_length = tf.shape(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[1] if position_ids is None: position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] @@ -160,8 +160,8 @@ class TFEmbeddings(tf.keras.layers.Layer): Returns: float32 tensor with shape [batch_size, length, vocab_size]. 
""" - batch_size = tf.shape(inputs)[0] - length = tf.shape(inputs)[1] + batch_size = shape_list(inputs)[0] + length = shape_list(inputs)[1] x = tf.reshape(inputs, [-1, self.dim]) logits = tf.matmul(x, self.word_embeddings, transpose_b=True) diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py index 5e416a5e3a..aebe790114 100644 --- a/transformers/modeling_tf_gpt2.py +++ b/transformers/modeling_tf_gpt2.py @@ -92,7 +92,7 @@ class TFAttention(tf.keras.layers.Layer): # q, k, v have shape [batch, heads, sequence, features] w = tf.matmul(q, k, transpose_b=True) if self.scale: - dk = tf.cast(tf.shape(k)[-1], tf.float32) # scale attention_scores + dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores w = w / tf.math.sqrt(dk) # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py index c553d92317..dac3b17590 100644 --- a/transformers/modeling_tf_openai.py +++ b/transformers/modeling_tf_openai.py @@ -98,7 +98,7 @@ class TFAttention(tf.keras.layers.Layer): # q, k, v have shape [batch, heads, sequence, features] w = tf.matmul(q, k, transpose_b=True) if self.scale: - dk = tf.cast(tf.shape(k)[-1], tf.float32) # scale attention_scores + dk = tf.cast(shape_list(k)[-1], tf.float32) # scale attention_scores w = w / tf.math.sqrt(dk) # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. 
diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py index 450c0c72f2..954279f873 100644 --- a/transformers/modeling_tf_roberta.py +++ b/transformers/modeling_tf_roberta.py @@ -24,7 +24,7 @@ import numpy as np import tensorflow as tf from .configuration_roberta import RobertaConfig -from .modeling_tf_utils import TFPreTrainedModel, get_initializer +from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list from .file_utils import add_start_docstrings from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new @@ -51,9 +51,9 @@ class TFRobertaEmbeddings(TFBertEmbeddings): input_ids, position_ids, token_type_ids, inputs_embeds = inputs if input_ids is not None: - seq_length = tf.shape(input_ids)[1] + seq_length = shape_list(input_ids)[1] else: - seq_length = tf.shape(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[1] if position_ids is None: position_ids = tf.range(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=tf.int32)[tf.newaxis, :] diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py index 8a8d11cfbc..fd325e218e 100644 --- a/transformers/modeling_tf_transfo_xl.py +++ b/transformers/modeling_tf_transfo_xl.py @@ -337,7 +337,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer): emb_i = tf.einsum('id,de->ie', emb_i, self.emb_projs[i]) mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64) - emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(tf.shape(emb_flat), dtype=tf.int64)) + emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(shape_list(emb_flat), dtype=tf.int64)) embed_shape = shape_list(inp) + [self.d_proj] embed = tf.reshape(emb_flat, embed_shape) diff --git a/transformers/modeling_tf_transfo_xl_utilities.py b/transformers/modeling_tf_transfo_xl_utilities.py index d7666a650e..e6a6dfe686 100644 --- a/transformers/modeling_tf_transfo_xl_utilities.py +++ b/transformers/modeling_tf_transfo_xl_utilities.py @@ -105,7 +105,7 @@ class 
TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): @staticmethod def _gather_logprob(logprob, target): - lp_size = tf.shape(logprob) + lp_size = shape_list(logprob) r = tf.range(lp_size[0]) idx = tf.stack([r, target], 1) return tf.gather_nd(logprob, idx) @@ -159,7 +159,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target) cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1] if target is not None: - loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(tf.shape(loss), dtype=tf.int64)) + loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(shape_list(loss), dtype=tf.int64)) out = tf.concat(out, axis=-1) if target is not None: diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index 569b2faa4b..e4ba55e25e 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -494,7 +494,7 @@ class TFSequenceSummary(tf.keras.layers.Layer): def shape_list(x): """Deal with dynamic shape in tensorflow cleanly.""" static = x.shape.as_list() - dynamic = tf.shape(x) + dynamic = tf.shape(x)  # must stay tf.shape: calling shape_list(x) here would recurse forever return [dynamic[i] if s is None else s for i, s in enumerate(static)] def get_initializer(initializer_range=0.02): diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py index 4733ea8589..215d906f57 100644 --- a/transformers/modeling_tf_xlnet.py +++ b/transformers/modeling_tf_xlnet.py @@ -112,8 +112,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): def prune_heads(self, heads): raise NotImplementedError - @staticmethod - def rel_shift(x, klen=-1): + def rel_shift(self, x, klen=-1): """perform relative shift to form the relative attention score.""" x_size = shape_list(x) @@ -135,7 +134,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): # position based attention score bd = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r) - bd = self.rel_shift(bd,
klen=shape_list(ac)[1]) # segment based attention score if seg_mat is None: @@ -192,7 +191,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): if g is not None: ###### Two-stream attention with relative positional encoding. # content based attention score - if mems is not None and mems.shape.ndims > 1: + if mems is not None and len(shape_list(mems)) > 1: cat = tf.concat([mems, h], axis=0) else: cat = h @@ -252,7 +251,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): else: ###### Multi-head attention with relative positional encoding - if mems is not None and mems.shape.ndims > 1: + if mems is not None and len(shape_list(mems)) > 1: cat = tf.concat([mems, h], axis=0) else: cat = h @@ -565,7 +564,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): if data_mask is not None: # all mems can be attended to - mems_mask = tf.zeros([tf.shape(data_mask)[0], mlen, bsz], + mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz], dtype=dtype_float) data_mask = tf.concat([mems_mask, data_mask], axis=1) if attn_mask is None: @@ -590,7 +589,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): word_emb_k = self.word_embedding(input_ids) output_h = self.dropout(word_emb_k, training=training) if target_mapping is not None: - word_emb_q = tf.tile(self.mask_emb, [tf.shape(target_mapping)[0], bsz, 1]) + word_emb_q = tf.tile(self.mask_emb, [shape_list(target_mapping)[0], bsz, 1]) # else: # We removed the inp_q input which was same as target mapping # inp_q_ext = inp_q[:, :, None] # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k