Replace strided slice with tf.expand_dims (#10078)

* Replace tf.newaxis -> tf.expand_dims * Fix tests * Fix tests * Use reshape when a tensor needs a double expand * Fix GPT2 * Fix GPT2
2021-02-09 17:48:28 +01:00
parent e7381c4596
commit b82fe7d258
17 changed files with 58 additions and 47 deletions
--- a/src/transformers/models/roberta/modeling_tf_roberta.py
+++ b/src/transformers/models/roberta/modeling_tf_roberta.py
@@ -147,9 +147,9 @@ class TFRobertaEmbeddings(tf.keras.layers.Layer):
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = self.create_position_ids_from_input_ids(input_ids=input_ids)
            else:
-                position_ids = tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1)[
-                    tf.newaxis, :
-                ]
+                position_ids = tf.expand_dims(
+                    tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
+                )
                position_ids = tf.tile(input=position_ids, multiples=(input_shape[0], 1))

        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
@@ -533,7 +533,7 @@ class TFRobertaMainLayer(tf.keras.layers.Layer):
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = inputs["attention_mask"][:, tf.newaxis, tf.newaxis, :]
+        extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for