diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 160a68c920..bbcbf125d8 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -560,6 +560,18 @@ def input_processing(func, config, **kwargs): if "kwargs" in output: del output["kwargs"] + cast_output = dict() + for key, val in output.items(): + if isinstance(val, tf.Tensor) and val.dtype == tf.int32: + cast_output[key] = tf.cast(val, tf.int64) + elif isinstance(val, np.ndarray) and val.dtype == np.int32: + cast_output[key] = val.astype(np.int64) + else: + cast_output[key] = val + + output = cast_output + del cast_output + if config is not None: boolean_dict = { k: v @@ -1054,9 +1066,9 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), - "token_type_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), + "token_type_ids": tf.TensorSpec((None, None), tf.int64, name="token_type_ids"), } ] ) @@ -1082,6 +1094,29 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu """ raise NotImplementedError + def save( + self, + filepath, + overwrite=True, + include_optimizer=True, + save_format=None, + signatures=None, + options=None, + save_traces=True, + ): + # Very simple wrapper that ensures we set the correct serving signature when saving + if signatures is None and hasattr(self, "serving"): + signatures = self.serving + super().save( + filepath, + overwrite=overwrite, + include_optimizer=include_optimizer, + save_format=save_format, + signatures=signatures, + options=options, + save_traces=save_traces, + ) + def get_input_embeddings(self) -> tf.keras.layers.Layer: """ Returns the model's input embeddings layer. diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index 3f448b4d2e..f4e9532817 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -64,11 +64,15 @@ LARGE_NEGATIVE = -1e8 def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): pad_token_id = tf.cast(pad_token_id, input_ids.dtype) decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) - start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id) + start_tokens = tf.fill( + (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype) + ) shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) # replace possible -100 values in labels by `pad_token_id` shifted_input_ids = tf.where( - shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)), + shifted_input_ids, ) # "Verify that `labels` has only positive values and -100" @@ -475,10 +479,10 @@ class TFBartPretrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), - "decoder_input_ids": tf.TensorSpec((None, None), tf.int32, name="decoder_input_ids"), - "decoder_attention_mask": tf.TensorSpec((None, None), tf.int32, name="decoder_attention_mask"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), + "decoder_input_ids": tf.TensorSpec((None, None), tf.int64, name="decoder_input_ids"), + "decoder_attention_mask": tf.TensorSpec((None, None), tf.int64, name="decoder_attention_mask"), } ] ) diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 8ab88b9730..b1240b4763 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -1799,9 +1799,9 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"), - "token_type_ids": tf.TensorSpec((None, None, None), tf.int32, name="token_type_ids"), + "input_ids": tf.TensorSpec((None, None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None, None), tf.int64, name="attention_mask"), + "token_type_ids": tf.TensorSpec((None, None, None), tf.int64, name="token_type_ids"), } ] ) diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py index 7843e057f3..c56b30cf83 100644 --- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -66,11 +66,15 @@ LARGE_NEGATIVE = -1e8 def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): pad_token_id = tf.cast(pad_token_id, input_ids.dtype) decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) - start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id) + start_tokens = tf.fill( + (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype) + ) shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) # replace possible -100 values in labels by `pad_token_id` shifted_input_ids = tf.where( - shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)), + shifted_input_ids, ) # "Verify that `labels` has only positive values and -100" diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py index 2f1a94ba96..8383dc0972 100644 --- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -65,11 +65,15 @@ LARGE_NEGATIVE = -1e8 def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): pad_token_id = tf.cast(pad_token_id, input_ids.dtype) decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) - start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id) + start_tokens = tf.fill( + (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype) + ) shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) # replace possible -100 values in labels by `pad_token_id` shifted_input_ids = tf.where( - shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)), + shifted_input_ids, ) # "Verify that `labels` has only positive values and -100" diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py index 94656a0b39..df49006210 100644 --- a/src/transformers/models/clip/modeling_tf_clip.py +++ b/src/transformers/models/clip/modeling_tf_clip.py @@ -1097,8 +1097,8 @@ class TFCLIPTextModel(TFCLIPPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), } ] ) diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py index 81e3b0b019..f62718af5f 100644 --- a/src/transformers/models/convbert/modeling_tf_convbert.py +++ b/src/transformers/models/convbert/modeling_tf_convbert.py @@ -1127,9 +1127,9 @@ class TFConvBertForMultipleChoice(TFConvBertPreTrainedModel, TFMultipleChoiceLos @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"), - "token_type_ids": tf.TensorSpec((None, None, None), tf.int32, name="token_type_ids"), + "input_ids": tf.TensorSpec((None, None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None, None), tf.int64, name="attention_mask"), + "token_type_ids": tf.TensorSpec((None, None, None), tf.int64, name="token_type_ids"), } ] ) diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py index 64c3338b64..bc1df7f41a 100644 --- a/src/transformers/models/distilbert/modeling_tf_distilbert.py +++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -424,8 +424,8 @@ class TFDistilBertPreTrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), } ] ) diff --git a/src/transformers/models/dpr/modeling_tf_dpr.py b/src/transformers/models/dpr/modeling_tf_dpr.py index 96ee761b81..c166cd0486 100644 --- a/src/transformers/models/dpr/modeling_tf_dpr.py +++ b/src/transformers/models/dpr/modeling_tf_dpr.py @@ -376,8 +376,8 @@ class TFDPRPretrainedReader(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), } ] ) diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index 2bf59d0c3d..da71be87d9 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -1511,9 +1511,9 @@ class TFFunnelForMultipleChoice(TFFunnelPreTrainedModel, TFMultipleChoiceLoss): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), "attention_mask": tf.TensorSpec((None, None), tf.float32, name="attention_mask"), - "token_type_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"), + "token_type_ids": tf.TensorSpec((None, None), tf.int64, name="token_type_ids"), } ] ) diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py index a876511fff..5b1e21a3c2 100644 --- a/src/transformers/models/gpt2/modeling_tf_gpt2.py +++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py @@ -548,8 +548,8 @@ class TFGPT2PreTrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), } ] ) diff --git a/src/transformers/models/gptj/modeling_tf_gptj.py b/src/transformers/models/gptj/modeling_tf_gptj.py index 943d9b1fff..67d17cdc05 100644 --- a/src/transformers/models/gptj/modeling_tf_gptj.py +++ b/src/transformers/models/gptj/modeling_tf_gptj.py @@ -527,8 +527,8 @@ class TFGPTJPreTrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), } ] ) diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index e9cf9adb76..ec7458a7eb 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -1312,8 +1312,8 @@ class TFHubertPreTrainedModel(TFPreTrainedModel): input_signature=[ { "input_values": tf.TensorSpec((None, None), tf.float32, name="input_values"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), - "token_type_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), + "token_type_ids": tf.TensorSpec((None, None), tf.int64, name="token_type_ids"), } ] ) diff --git a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py index 418ab2b148..6bb3eb54e1 100644 --- a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py @@ -988,10 +988,10 @@ class TFLayoutLMv3PreTrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "bbox": tf.TensorSpec((None, None, 4), tf.int32, name="bbox"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "bbox": tf.TensorSpec((None, None, 4), tf.int64, name="bbox"), "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), } ] ) diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index a91f769fbf..3702f1cca3 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -55,16 +55,23 @@ _TOKENIZER_FOR_DOC = "LEDTokenizer" LARGE_NEGATIVE = -1e8 +# Copied from transformers.models.bart.modeling_tf_bart.shift_tokens_right def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): - start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id) + pad_token_id = tf.cast(pad_token_id, input_ids.dtype) + decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) + start_tokens = tf.fill( + (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype) + ) shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) # replace possible -100 values in labels by `pad_token_id` shifted_input_ids = tf.where( - shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)), + shifted_input_ids, ) # "Verify that `labels` has only positive values and -100" - assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0)) + assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype)) # Make sure the assertion op is called by wrapping the result in an identity no-op with tf.control_dependencies([assert_gte0]): diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py index 88535aee12..3b44056a6c 100644 --- a/src/transformers/models/lxmert/modeling_tf_lxmert.py +++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py @@ -800,12 +800,12 @@ class TFLxmertPreTrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), "visual_feats": tf.TensorSpec((None, None, None), tf.float32, name="visual_feats"), "visual_pos": tf.TensorSpec((None, None, None), tf.float32, name="visual_pos"), - "visual_attention_mask": tf.TensorSpec((None, None), tf.int32, name="visual_attention_mask"), - "token_type_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"), + "visual_attention_mask": tf.TensorSpec((None, None), tf.int64, name="visual_attention_mask"), + "token_type_ids": tf.TensorSpec((None, None), tf.int64, name="token_type_ids"), } ] ) diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py index 01522346d8..580c38f843 100644 --- a/src/transformers/models/marian/modeling_tf_marian.py +++ b/src/transformers/models/marian/modeling_tf_marian.py @@ -65,11 +65,15 @@ LARGE_NEGATIVE = -1e8 def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): pad_token_id = tf.cast(pad_token_id, input_ids.dtype) decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) - start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id) + start_tokens = tf.fill( + (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype) + ) shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) # replace possible -100 values in labels by `pad_token_id` shifted_input_ids = tf.where( - shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)), + shifted_input_ids, ) # "Verify that `labels` has only positive values and -100" diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py index 0b7a81aa33..ec034a5fa8 100644 --- a/src/transformers/models/mbart/modeling_tf_mbart.py +++ b/src/transformers/models/mbart/modeling_tf_mbart.py @@ -69,11 +69,15 @@ def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int): if pad_token_id is None: raise ValueError("self.model.config.pad_token_id has to be defined.") # replace possible -100 values in labels by `pad_token_id` - input_ids = tf.where(input_ids == -100, tf.fill(shape_list(input_ids), pad_token_id), input_ids) + input_ids = tf.where( + input_ids == -100, tf.fill(shape_list(input_ids), tf.cast(pad_token_id, input_ids.dtype)), input_ids + ) language_id_index = ( tf.reduce_sum(tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=input_ids.dtype), axis=-1) - 1 ) - language_id_index = tf.stack([tf.range(shape_list(input_ids)[0]), language_id_index], axis=-1) + language_id_index = tf.stack( + [tf.range(shape_list(input_ids)[0], dtype=input_ids.dtype), language_id_index], axis=-1 + ) languages_ids = tf.gather_nd(input_ids, language_id_index) shifted_input_ids = tf.concat([tf.expand_dims(languages_ids, axis=-1), input_ids[:, :-1]], axis=-1) diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index 7b63dff489..3fc4318461 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -76,8 +76,8 @@ class TFMPNetPreTrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), } ] ) diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index e1bb1a5f6f..357866228b 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -359,8 +359,8 @@ class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), } ] ) diff --git a/src/transformers/models/opt/modeling_tf_opt.py b/src/transformers/models/opt/modeling_tf_opt.py index 8be1a8f091..b523ecceb1 100644 --- a/src/transformers/models/opt/modeling_tf_opt.py +++ b/src/transformers/models/opt/modeling_tf_opt.py @@ -425,8 +425,8 @@ class TFOPTPreTrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), } ] ) diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py index 63017efb03..97efed9285 100644 --- a/src/transformers/models/pegasus/modeling_tf_pegasus.py +++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py @@ -65,11 +65,15 @@ LARGE_NEGATIVE = -1e8 def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): pad_token_id = tf.cast(pad_token_id, input_ids.dtype) decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) - start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id) + start_tokens = tf.fill( + (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype) + ) shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) # replace possible -100 values in labels by `pad_token_id` shifted_input_ids = tf.where( - shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)), + shifted_input_ids, ) # "Verify that `labels` has only positive values and -100" diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py index a31b2d4521..08a9adf591 100644 --- a/src/transformers/models/rag/modeling_tf_rag.py +++ b/src/transformers/models/rag/modeling_tf_rag.py @@ -1301,17 +1301,18 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss pad_token_id = self.generator.config.pad_token_id assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." - shifted_input_ids = tf.cast(input_ids, tf.int32) - start_tokens = tf.fill((shape_list(shifted_input_ids)[0], 1), start_token_id) - shifted_input_ids = tf.concat([start_tokens, shifted_input_ids[:, :-1]], -1) + start_tokens = tf.fill((shape_list(input_ids)[0], 1), tf.cast(start_token_id, input_ids.dtype)) + shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) # replace possible -100 values in labels by `pad_token_id` shifted_input_ids = tf.where( - shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.cast(pad_token_id, input_ids.dtype)), + shifted_input_ids, ) # "Verify that `labels` has only positive values and -100" - assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.cast(0, tf.int32)) + assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.cast(0, shifted_input_ids.dtype)) # Make sure the assertion op is called by wrapping the result in an identity no-op with tf.control_dependencies([assert_gte0]): @@ -1324,7 +1325,10 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss n_docs = n_docs if n_docs is not None else self.config.n_docs # shift tokens left (from original Pytorch's version) - target = tf.concat([target[:, 1:], tf.fill([target.shape[0], 1], self.config.generator.pad_token_id)], axis=1) + target = tf.concat( + [target[:, 1:], tf.fill([target.shape[0], 1], tf.cast(self.config.generator.pad_token_id, target.dtype))], + axis=1, + ) rag_logprobs = self.marginalize(seq_logits, doc_scores, n_docs) loss = self.hf_compute_loss(target, rag_logprobs, from_logits=True, reduce_loss=reduce_loss) @@ -1571,7 +1575,10 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL self, seq_logits, doc_scores, target, reduce_loss=False, epsilon=0.0, exclude_bos_score=False, n_docs=None ): # shift tokens left - target = tf.concat([target[:, 1:], tf.fill([target.shape[0], 1], self.config.generator.pad_token_id)], axis=1) + target = tf.concat( + [target[:, 1:], tf.fill([target.shape[0], 1], tf.cast(self.config.generator.pad_token_id, target.dtype))], + axis=1, + ) # bos_token_id is None for T5 bos_token_id = self.config.bos_token_id or self.config.generator.bos_token_id @@ -1580,7 +1587,7 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL use_bos = bos_token_id is not None and equal_bos_token_id_all def _mask_pads(ll, smooth_obj): - pad_mask = tf.equal(target, self.config.generator.pad_token_id) + pad_mask = tf.equal(target, tf.cast(self.config.generator.pad_token_id, target.dtype)) if tf.reduce_any(pad_mask): ll = tf.where(pad_mask, 0.0, ll) smooth_obj = tf.where(pad_mask, 0.0, smooth_obj) @@ -1611,7 +1618,7 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL def torch_gather(param, id_tensor): # 2d-gather torch equivalent: https://stackoverflow.com/questions/52129909/tensorflow-equivalent-of-torch-gather def gather2d(target, id_tensor): - idx = tf.stack([tf.range(tf.shape(id_tensor)[0]), id_tensor[:, 0]], axis=-1) + idx = tf.stack([tf.range(tf.shape(id_tensor)[0], dtype=id_tensor.dtype), id_tensor[:, 0]], axis=-1) result = tf.gather_nd(target, idx) return tf.expand_dims(result, axis=-1) diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py index aea9fa325b..be1946f111 100644 --- a/src/transformers/models/rembert/modeling_tf_rembert.py +++ b/src/transformers/models/rembert/modeling_tf_rembert.py @@ -1435,9 +1435,9 @@ class TFRemBertForMultipleChoice(TFRemBertPreTrainedModel, TFMultipleChoiceLoss) @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"), - "token_type_ids": tf.TensorSpec((None, None, None), tf.int32, name="token_type_ids"), + "input_ids": tf.TensorSpec((None, None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None, None), tf.int64, name="attention_mask"), + "token_type_ids": tf.TensorSpec((None, None, None), tf.int64, name="token_type_ids"), } ] ) diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index c2e1477c7e..04f4283b20 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -798,8 +798,8 @@ class TFRobertaPreTrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), } ] ) diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index 852b1424b4..4c526b6941 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -1211,9 +1211,9 @@ class TFRoFormerForMultipleChoice(TFRoFormerPreTrainedModel, TFMultipleChoiceLos @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"), - "token_type_ids": tf.TensorSpec((None, None, None), tf.int32, name="token_type_ids"), + "input_ids": tf.TensorSpec((None, None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None, None), tf.int64, name="attention_mask"), + "token_type_ids": tf.TensorSpec((None, None, None), tf.int64, name="token_type_ids"), } ] ) diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index ccae37b3eb..e6e1b0facc 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -67,11 +67,15 @@ LARGE_NEGATIVE = -1e8 def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): pad_token_id = tf.cast(pad_token_id, input_ids.dtype) decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) - start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id) + start_tokens = tf.fill( + (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype) + ) shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) # replace possible -100 values in labels by `pad_token_id` shifted_input_ids = tf.where( - shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)), + shifted_input_ids, ) # "Verify that `labels` has only positive values and -100" @@ -591,9 +595,9 @@ class TFSpeech2TextPreTrainedModel(TFPreTrainedModel): input_signature=[ { "input_features": tf.TensorSpec((None, None, None), tf.float32, name="input_features"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), - "decoder_input_ids": tf.TensorSpec((None, None), tf.int32, name="decoder_input_ids"), - "decoder_attention_mask": tf.TensorSpec((None, None), tf.int32, name="decoder_attention_mask"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), + "decoder_input_ids": tf.TensorSpec((None, None), tf.int64, name="decoder_input_ids"), + "decoder_attention_mask": tf.TensorSpec((None, None), tf.int64, name="decoder_attention_mask"), } ] ) diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py index b8a9e86ac9..ae39ea2615 100644 --- a/src/transformers/models/t5/modeling_tf_t5.py +++ b/src/transformers/models/t5/modeling_tf_t5.py @@ -872,10 +872,10 @@ class TFT5PreTrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), - "decoder_input_ids": tf.TensorSpec((None, None), tf.int32, name="decoder_input_ids"), - "decoder_attention_mask": tf.TensorSpec((None, None), tf.int32, name="decoder_attention_mask"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), + "decoder_input_ids": tf.TensorSpec((None, None), tf.int64, name="decoder_input_ids"), + "decoder_attention_mask": tf.TensorSpec((None, None), tf.int64, name="decoder_attention_mask"), } ] ) diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py index ea379a039d..48c26a138d 100644 --- a/src/transformers/models/tapas/modeling_tf_tapas.py +++ b/src/transformers/models/tapas/modeling_tf_tapas.py @@ -865,9 +865,9 @@ class TFTapasPreTrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), "attention_mask": tf.TensorSpec((None, None), tf.float32, name="attention_mask"), - "token_type_ids": tf.TensorSpec((None, None, None), tf.int32, name="token_type_ids"), + "token_type_ids": tf.TensorSpec((None, None, None), tf.int64, name="token_type_ids"), } ] ) diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py index b0d26e6edf..53370c1d3e 100644 --- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py @@ -686,7 +686,7 @@ class TFTransfoXLPreTrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), } ] ) diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index 58110b5120..6fee92e672 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -1345,8 +1345,8 @@ class TFWav2Vec2PreTrainedModel(TFPreTrainedModel): input_signature=[ { "input_values": tf.TensorSpec((None, None), tf.float32, name="input_values"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), - "token_type_ids": tf.TensorSpec((None, None), tf.int32, name="token_type_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), + "token_type_ids": tf.TensorSpec((None, None), tf.int64, name="token_type_ids"), } ] ) diff --git a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py index ac11e7ae7c..6dd62d270b 100644 --- a/src/transformers/models/xglm/modeling_tf_xglm.py +++ b/src/transformers/models/xglm/modeling_tf_xglm.py @@ -636,8 +636,8 @@ class TFXGLMPreTrainedModel(TFPreTrainedModel): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"), + "input_ids": tf.TensorSpec((None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None), tf.int64, name="attention_mask"), } ] ) diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py index 739ad50ecb..1b079d00be 100644 --- a/src/transformers/models/xlnet/modeling_tf_xlnet.py +++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py @@ -1563,9 +1563,9 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss): @tf.function( input_signature=[ { - "input_ids": tf.TensorSpec((None, None, None), tf.int32, name="input_ids"), - "attention_mask": tf.TensorSpec((None, None, None), tf.int32, name="attention_mask"), - "token_type_ids": tf.TensorSpec((None, None, None), tf.int32, name="token_type_ids"), + "input_ids": tf.TensorSpec((None, None, None), tf.int64, name="input_ids"), + "attention_mask": tf.TensorSpec((None, None, None), tf.int64, name="attention_mask"), + "token_type_ids": tf.TensorSpec((None, None, None), tf.int64, name="token_type_ids"), } ] ) diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 0d025ca98c..8225105ddf 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -1685,16 +1685,21 @@ _TOKENIZER_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Tokenizer" LARGE_NEGATIVE = -1e8 +# Copied from transformers.models.bart.modeling_tf_bart.shift_tokens_right def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): - start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id) + pad_token_id = tf.cast(pad_token_id, input_ids.dtype) + decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) + start_tokens = tf.fill((shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype)) shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) # replace possible -100 values in labels by `pad_token_id` shifted_input_ids = tf.where( - shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)), + shifted_input_ids, ) # "Verify that `labels` has only positive values and -100" - assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0)) + assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=shifted_input_ids.dtype)) # Make sure the assertion op is called by wrapping the result in an identity no-op with tf.control_dependencies([assert_gte0]): diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index ca8840d2aa..0c55b4d8ed 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -1887,9 +1887,9 @@ class UtilsFunctionsTest(unittest.TestCase): return pixel_values, output_attentions, output_hidden_states, return_dict dummy_model = DummyModel() - input_ids = tf.constant([0, 1, 2, 3]) - past = tf.constant([4, 5, 6, 7]) - pixel_values = tf.constant([8, 9, 10, 11]) + input_ids = tf.constant([0, 1, 2, 3], dtype=tf.int64) + past = tf.constant([4, 5, 6, 7], dtype=tf.int64) + pixel_values = tf.constant([8, 9, 10, 11], dtype=tf.int64) # test case 1: Pass inputs as keyword arguments; Booleans are inherited from the config. output = dummy_model.call(input_ids=input_ids, past=past)