Add TFViTModel (#13778)
* Start the work for TFViTModel * Convert to TF code - need to check in the follow up commits * Clean up model code * Expose TFViTModel * make style * make quality * Add test * make style & quality * Fix some imports * fix wrong usage - *kwargs => ** kwargs * Fix Conv2D weight loading (PT->TF) issue * Add tests for images with different sizes + fix model * Fix some common tests for TFViTModel * Use inputs instead of input_ids in test_compile_tf_model * Add a comment about transpose and Conv2D in convert_tf_weight_name_to_pt_weight_name * Avoid transpose in TFViT call * Fix Conv2D issue in load_tf2_weights_in_pytorch_model * Use tf.keras.layers.Conv2D instead of tf.nn.conv2d * Using simpler heuristic to detect Conv2D layer * Change convert_tf_weight_name_to_pt_weight_name to return TransposeType * Check tf_weight_shape is not None before using it * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * fix missing comma * fix input dtype Co-authored-by: ydshieh <ydshieh@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -49,6 +49,7 @@ if is_tf_available():
|
||||
|
||||
from transformers import (
|
||||
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
|
||||
TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
|
||||
TF_MODEL_FOR_MASKED_LM_MAPPING,
|
||||
TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
|
||||
TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
|
||||
@@ -126,7 +127,10 @@ class TFModelTesterMixin:
|
||||
elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
|
||||
inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
|
||||
inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
|
||||
elif model_class in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
|
||||
elif model_class in [
|
||||
*get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
|
||||
*get_values(TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
|
||||
]:
|
||||
inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
|
||||
elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING):
|
||||
inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
|
||||
@@ -460,6 +464,8 @@ class TFModelTesterMixin:
|
||||
pt_inputs_dict[name] = key
|
||||
elif name == "input_values":
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
|
||||
elif name == "pixel_values":
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
|
||||
else:
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
|
||||
|
||||
@@ -504,6 +510,8 @@ class TFModelTesterMixin:
|
||||
pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long)
|
||||
elif name == "input_values":
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
|
||||
elif name == "pixel_values":
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
|
||||
else:
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
|
||||
# need to rename encoder-decoder "inputs" for PyTorch
|
||||
@@ -605,7 +613,7 @@ class TFModelTesterMixin:
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
if self.is_encoder_decoder:
|
||||
input_ids = {
|
||||
inputs = {
|
||||
"decoder_input_ids": tf.keras.Input(
|
||||
batch_shape=(2, max_input),
|
||||
name="decoder_input_ids",
|
||||
@@ -613,10 +621,22 @@ class TFModelTesterMixin:
|
||||
),
|
||||
"input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"),
|
||||
}
|
||||
# TODO: A better way to handle vision models
|
||||
elif model_class.__name__ in ["TFViTModel", "TFViTForImageClassification"]:
|
||||
inputs = tf.keras.Input(
|
||||
batch_shape=(
|
||||
3,
|
||||
self.model_tester.num_channels,
|
||||
self.model_tester.image_size,
|
||||
self.model_tester.image_size,
|
||||
),
|
||||
name="pixel_values",
|
||||
dtype="float32",
|
||||
)
|
||||
elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
|
||||
input_ids = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32")
|
||||
inputs = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32")
|
||||
else:
|
||||
input_ids = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32")
|
||||
inputs = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32")
|
||||
|
||||
# Prepare our model
|
||||
model = model_class(config)
|
||||
@@ -626,14 +646,14 @@ class TFModelTesterMixin:
|
||||
model.save_pretrained(tmpdirname, saved_model=False)
|
||||
model = model_class.from_pretrained(tmpdirname)
|
||||
|
||||
outputs_dict = model(input_ids)
|
||||
outputs_dict = model(inputs)
|
||||
hidden_states = outputs_dict[0]
|
||||
|
||||
# Add a dense layer on top to test integration with other keras modules
|
||||
outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)
|
||||
|
||||
# Compile extended model
|
||||
extended_model = tf.keras.Model(inputs=[input_ids], outputs=[outputs])
|
||||
extended_model = tf.keras.Model(inputs=[inputs], outputs=[outputs])
|
||||
extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
|
||||
|
||||
def test_keyword_and_dict_args(self):
|
||||
@@ -647,6 +667,8 @@ class TFModelTesterMixin:
|
||||
|
||||
inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
|
||||
input_ids = inputs_keywords.pop("input_ids", None)
|
||||
if input_ids is None:
|
||||
input_ids = inputs_keywords.pop("pixel_values", None)
|
||||
outputs_keywords = model(input_ids, **inputs_keywords)
|
||||
output_dict = outputs_dict[0].numpy()
|
||||
output_keywords = outputs_keywords[0].numpy()
|
||||
@@ -1236,7 +1258,8 @@ class TFModelTesterMixin:
|
||||
|
||||
# Test that model correctly compute the loss with kwargs
|
||||
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
|
||||
input_ids = prepared_for_class.pop("input_ids")
|
||||
input_name = "input_ids" if "input_ids" in prepared_for_class else "pixel_values"
|
||||
input_ids = prepared_for_class.pop(input_name)
|
||||
|
||||
loss = model(input_ids, **prepared_for_class)[0]
|
||||
self.assertEqual(loss.shape, [loss_size])
|
||||
@@ -1255,7 +1278,7 @@ class TFModelTesterMixin:
|
||||
signature_names = list(signature.keys())
|
||||
|
||||
# Create a dictionary holding the location of the tensors in the tuple
|
||||
tuple_index_mapping = {0: "input_ids"}
|
||||
tuple_index_mapping = {0: input_name}
|
||||
for label_key in label_keys:
|
||||
label_key_index = signature_names.index(label_key)
|
||||
tuple_index_mapping[label_key_index] = label_key
|
||||
|
||||
Reference in New Issue
Block a user