From 86cff21cf60b86859934f6e6d867c95cceb8d6ac Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 29 Mar 2022 18:04:20 +0200 Subject: [PATCH] Fix some TF GPT-J CI testings (#16454) * Fix for test_mixed_precision * Fix test_saved_model_creation by using shape_list instead of shape * skip test_model_from_pretrained on GPU for now to avoid GPU OOM * skip test_gptj_sample_max_time for now Co-authored-by: ydshieh --- src/transformers/models/gptj/modeling_tf_gptj.py | 16 ++++++++-------- tests/gptj/test_modeling_tf_gptj.py | 5 +++++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/gptj/modeling_tf_gptj.py b/src/transformers/models/gptj/modeling_tf_gptj.py index 6c24d74769..4bb9ec4a81 100644 --- a/src/transformers/models/gptj/modeling_tf_gptj.py +++ b/src/transformers/models/gptj/modeling_tf_gptj.py @@ -59,13 +59,13 @@ GPTJ_PRETRAINED_MODEL_ARCHIVE_LIST = [ def fixed_pos_embedding(x: tf.Tensor, seq_dim: int = 1, seq_len: Optional[int] = None) -> Tuple[tf.Tensor, tf.Tensor]: - dim = x.shape[-1] + dim = shape_list(x)[-1] if seq_len is None: - seq_len = x.shape[seq_dim] + seq_len = shape_list(x)[seq_dim] inv_freq = tf.cast(1.0 / (10000 ** (tf.range(0, dim, 2) / dim)), tf.float32) seq_len_range = tf.cast(tf.range(seq_len), tf.float32) sinusoid_inp = tf.cast(tf.einsum("i , j -> i j", seq_len_range, inv_freq), tf.float32) - return tf.sin(sinusoid_inp), tf.cos(sinusoid_inp) + return tf.cast(tf.sin(sinusoid_inp), dtype=x.dtype), tf.cast(tf.cos(sinusoid_inp), dtype=x.dtype) def rotate_every_two(x: tf.Tensor) -> tf.Tensor: @@ -77,8 +77,8 @@ def rotate_every_two(x: tf.Tensor) -> tf.Tensor: def apply_rotary_pos_emb(x: tf.Tensor, sincos: tf.Tensor, offset: int = 0) -> tf.Tensor: sin_pos, cos_pos = sincos - sin_pos = tf.repeat(sin_pos[None, offset : x.shape[1] + offset, None, :], 2, 3) - cos_pos = tf.repeat(cos_pos[None, offset : x.shape[1] + offset, None, :], 2, 3) + sin_pos = tf.repeat(sin_pos[None, offset 
: shape_list(x)[1] + offset, None, :], 2, 3) + cos_pos = tf.repeat(cos_pos[None, offset : shape_list(x)[1] + offset, None, :], 2, 3) return (x * cos_pos) + (rotate_every_two(x) * sin_pos) @@ -173,7 +173,7 @@ class TFGPTJAttention(tf.keras.layers.Layer): head_mask: Optional[tf.Tensor] = None, ) -> Tuple[tf.Tensor, tf.Tensor]: # compute causal mask from causal mask buffer - query_length, key_length = query.shape[-2], key.shape[-2] + query_length, key_length = shape_list(query)[-2], shape_list(key)[-2] causal_mask = self.get_causal_mask(key_length, query_length) # Keep the attention weights computation in fp32 to avoid overflow issues @@ -218,11 +218,11 @@ class TFGPTJAttention(tf.keras.layers.Layer): key = self._split_heads(key, True) value = self._split_heads(value, False) - seq_len = key.shape[1] + seq_len = shape_list(key)[1] offset = 0 if layer_past is not None: - offset = layer_past[0].shape[-2] + offset = shape_list(layer_past[0])[-2] seq_len += offset if self.rotary_dim is not None: diff --git a/tests/gptj/test_modeling_tf_gptj.py b/tests/gptj/test_modeling_tf_gptj.py index 50bcf1cc8a..32ce3f8564 100644 --- a/tests/gptj/test_modeling_tf_gptj.py +++ b/tests/gptj/test_modeling_tf_gptj.py @@ -345,6 +345,10 @@ class TFGPTJModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, unittest.TestC assert name is None @slow + @unittest.skipIf( + not is_tf_available() or len(tf.config.list_physical_devices("GPU")) > 0, + "skip testing on GPU for now to avoid GPU OOM.", + ) def test_model_from_pretrained(self): model = TFGPTJModel.from_pretrained("EleutherAI/gpt-j-6B", from_pt=True) self.assertIsNotNone(model) @@ -395,6 +399,7 @@ class TFGPTJModelLanguageGenerationTest(unittest.TestCase): ) # token_type_ids should change output @slow + @unittest.skip(reason="TF generate currently has no time-based stopping criteria") def test_gptj_sample_max_time(self): tokenizer = AutoTokenizer.from_pretrained("anton-l/gpt-j-tiny-random") model = 
TFGPTJForCausalLM.from_pretrained("anton-l/gpt-j-tiny-random", from_pt=True)