Allow saved_model export of TFCLIPModel in save_pretrained (#16886)
* CLIP Serving * Add type hints per code review * Use black, flake8, and isort * Update src/transformers/models/clip/modeling_tf_clip.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * Rollback serving_output and add TODO * Remove irrelevant portions of failing tests * Revert "Rollback serving_output and add TODO" This reverts commit a4abfa6ba3b7875a13538dbc2ddc4eb17dfcca8d. * Rollback to original test/serving_output * Fix unused var * Apply suggestions from code review * Update formatting with black * Fix style again from rebase * Update tests/models/clip/test_modeling_tf_clip.py Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> Co-authored-by: Sean Moriarity <sean.l.moriarity.mil@army.mil> Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -551,11 +551,14 @@ class TFCLIPTextTransformer(tf.keras.layers.Layer):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32):
|
def _build_causal_attention_mask(self, batch_size, seq_length, dtype=tf.float32):
|
||||||
|
# It is possible with an unspecified sequence length for seq_length to be
|
||||||
diag = tf.constant(0.0, shape=(seq_length,), dtype=dtype)
|
# a runtime value, which is unsupported by tf.constant. Per the TensorFlow
|
||||||
|
# docs, tf.fill can handle runtime dynamic shapes:
|
||||||
|
# https://www.tensorflow.org/api_docs/python/tf/fill
|
||||||
|
diag = tf.cast(tf.fill((seq_length,), 0.0), dtype)
|
||||||
|
|
||||||
# set an additive 2D attention mask with all places being masked
|
# set an additive 2D attention mask with all places being masked
|
||||||
to_mask = tf.constant(-10000.0, shape=(seq_length, seq_length), dtype=dtype)
|
to_mask = tf.cast(tf.fill((seq_length, seq_length), -10000.0), dtype)
|
||||||
|
|
||||||
# set diagonal & lower triangular parts to 0 (i.e. the places not to be masked)
|
# set diagonal & lower triangular parts to 0 (i.e. the places not to be masked)
|
||||||
# TIP: think the 2D matrix as the space of (query_seq, key_seq)
|
# TIP: think the 2D matrix as the space of (query_seq, key_seq)
|
||||||
@@ -1082,6 +1085,18 @@ class TFCLIPTextModel(TFCLIPPreTrainedModel):
|
|||||||
|
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
|
@tf.function(
|
||||||
|
input_signature=[
|
||||||
|
{
|
||||||
|
"input_ids": tf.TensorSpec((None, None), tf.int32, name="input_ids"),
|
||||||
|
"attention_mask": tf.TensorSpec((None, None), tf.int32, name="attention_mask"),
|
||||||
|
}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
def serving(self, inputs: Dict[str, tf.Tensor]) -> TFBaseModelOutputWithPooling:
|
||||||
|
output = self.call(inputs)
|
||||||
|
return self.serving_output(output)
|
||||||
|
|
||||||
def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
|
def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
|
||||||
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
|
hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
|
||||||
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
|
attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
|
||||||
@@ -1123,7 +1138,7 @@ class TFCLIPVisionModel(TFCLIPPreTrainedModel):
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
def serving(self, inputs):
|
def serving(self, inputs: Dict[str, tf.Tensor]) -> TFBaseModelOutputWithPooling:
|
||||||
"""
|
"""
|
||||||
Method used for serving the model.
|
Method used for serving the model.
|
||||||
|
|
||||||
@@ -1226,7 +1241,7 @@ class TFCLIPModel(TFCLIPPreTrainedModel):
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
def serving(self, inputs):
|
def serving(self, inputs: Dict[str, tf.Tensor]) -> TFCLIPOutput:
|
||||||
"""
|
"""
|
||||||
Method used for serving the model.
|
Method used for serving the model.
|
||||||
|
|
||||||
@@ -1375,4 +1390,7 @@ class TFCLIPModel(TFCLIPPreTrainedModel):
|
|||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
def serving_output(self, output: TFCLIPOutput) -> TFCLIPOutput:
|
def serving_output(self, output: TFCLIPOutput) -> TFCLIPOutput:
|
||||||
|
# TODO: As is this currently fails with saved_model=True, because
|
||||||
|
# TensorFlow cannot trace through nested dataclasses. Reference:
|
||||||
|
# https://github.com/huggingface/transformers/pull/16886
|
||||||
return output
|
return output
|
||||||
|
|||||||
@@ -256,6 +256,62 @@ class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase):
|
|||||||
model = TFCLIPVisionModel.from_pretrained(model_name)
|
model = TFCLIPVisionModel.from_pretrained(model_name)
|
||||||
self.assertIsNotNone(model)
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
def test_saved_model_creation_extended(self):
|
||||||
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
config.output_hidden_states = True
|
||||||
|
config.output_attentions = True
|
||||||
|
|
||||||
|
if hasattr(config, "use_cache"):
|
||||||
|
config.use_cache = True
|
||||||
|
|
||||||
|
# in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
|
||||||
|
image_size = (self.model_tester.image_size, self.model_tester.image_size)
|
||||||
|
patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
|
||||||
|
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
|
||||||
|
seq_len = num_patches + 1
|
||||||
|
|
||||||
|
for model_class in self.all_model_classes:
|
||||||
|
class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
|
||||||
|
model = model_class(config)
|
||||||
|
num_out = len(model(class_inputs_dict))
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||||
|
model.save_pretrained(tmpdirname, saved_model=True)
|
||||||
|
saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
|
||||||
|
model = tf.keras.models.load_model(saved_model_dir)
|
||||||
|
outputs = model(class_inputs_dict)
|
||||||
|
output_hidden_states = outputs["hidden_states"]
|
||||||
|
output_attentions = outputs["attentions"]
|
||||||
|
|
||||||
|
# Check num outputs
|
||||||
|
self.assertEqual(len(outputs), num_out)
|
||||||
|
|
||||||
|
# Check num layers
|
||||||
|
expected_num_layers = getattr(
|
||||||
|
self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(len(output_hidden_states), expected_num_layers)
|
||||||
|
self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
|
||||||
|
|
||||||
|
# Check attention outputs
|
||||||
|
image_size = (self.model_tester.image_size, self.model_tester.image_size)
|
||||||
|
patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
|
||||||
|
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
|
||||||
|
seq_len = num_patches + 1
|
||||||
|
|
||||||
|
self.assertListEqual(
|
||||||
|
list(output_attentions[0].shape[-3:]),
|
||||||
|
[self.model_tester.num_attention_heads, seq_len, seq_len],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check hidden states
|
||||||
|
self.assertListEqual(
|
||||||
|
list(output_hidden_states[0].shape[-2:]),
|
||||||
|
[seq_len, self.model_tester.hidden_size],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TFCLIPTextModelTester:
|
class TFCLIPTextModelTester:
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -367,6 +423,54 @@ class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase):
|
|||||||
model = TFCLIPTextModel.from_pretrained(model_name)
|
model = TFCLIPTextModel.from_pretrained(model_name)
|
||||||
self.assertIsNotNone(model)
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
def test_saved_model_creation_extended(self):
|
||||||
|
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
|
config.output_hidden_states = True
|
||||||
|
config.output_attentions = True
|
||||||
|
|
||||||
|
if hasattr(config, "use_cache"):
|
||||||
|
config.use_cache = True
|
||||||
|
|
||||||
|
for model_class in self.all_model_classes:
|
||||||
|
class_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
|
||||||
|
model = model_class(config)
|
||||||
|
num_out = len(model(class_inputs_dict))
|
||||||
|
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||||
|
model.save_pretrained(tmpdirname, saved_model=True)
|
||||||
|
saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
|
||||||
|
model = tf.keras.models.load_model(saved_model_dir)
|
||||||
|
outputs = model(class_inputs_dict)
|
||||||
|
output_hidden_states = outputs["hidden_states"]
|
||||||
|
output_attentions = outputs["attentions"]
|
||||||
|
|
||||||
|
# Check number of outputs
|
||||||
|
self.assertEqual(len(outputs), num_out)
|
||||||
|
|
||||||
|
# Check number of layers
|
||||||
|
expected_num_layers = getattr(
|
||||||
|
self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check hidden states
|
||||||
|
self.assertEqual(len(output_hidden_states), expected_num_layers)
|
||||||
|
self.assertListEqual(
|
||||||
|
list(output_hidden_states[0].shape[-2:]),
|
||||||
|
[self.model_tester.seq_length, self.model_tester.hidden_size],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check attention outputs
|
||||||
|
self.assertEqual(len(output_attentions), self.model_tester.num_hidden_layers)
|
||||||
|
|
||||||
|
seq_length = self.model_tester.seq_length
|
||||||
|
key_length = getattr(self.model_tester, "key_length", seq_length)
|
||||||
|
|
||||||
|
self.assertListEqual(
|
||||||
|
list(output_attentions[0].shape[-3:]),
|
||||||
|
[self.model_tester.num_attention_heads, seq_length, key_length],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TFCLIPModelTester:
|
class TFCLIPModelTester:
|
||||||
def __init__(self, parent, is_training=True):
|
def __init__(self, parent, is_training=True):
|
||||||
@@ -502,6 +606,11 @@ class TFCLIPModelTest(TFModelTesterMixin, unittest.TestCase):
|
|||||||
model = TFCLIPModel.from_pretrained(model_name)
|
model = TFCLIPModel.from_pretrained(model_name)
|
||||||
self.assertIsNotNone(model)
|
self.assertIsNotNone(model)
|
||||||
|
|
||||||
|
@unittest.skip(reason="Currently `saved_model` doesn't work with nested outputs.")
|
||||||
|
@slow
|
||||||
|
def test_saved_model_creation_extended(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
# We will verify our results on an image of cute cats
|
# We will verify our results on an image of cute cats
|
||||||
def prepare_img():
|
def prepare_img():
|
||||||
|
|||||||
Reference in New Issue
Block a user