Add TFSpeech2Text (#15113)
* Add wrapper classes
* Convert inner layers to TF
* Add TF Encoder and Decoder layers
* TFSpeech2Text models
* Loadable model
* TF model with same outputs as PT model
* Test skeleton
* Correct tests and run the fixup
* Correct attention expansion
* TFSpeech2Text past_key_values with TF format
This commit is contained in:
@@ -57,6 +57,7 @@ if is_tf_available():
|
||||
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
|
||||
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
|
||||
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
|
||||
TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
|
||||
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
|
||||
BertConfig,
|
||||
TFAutoModel,
|
||||
@@ -140,6 +141,7 @@ class TFModelTesterMixin:
|
||||
*get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
|
||||
*get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
|
||||
*get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
|
||||
*get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING),
|
||||
]:
|
||||
inputs_dict["labels"] = tf.zeros(
|
||||
(self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
|
||||
@@ -358,7 +360,6 @@ class TFModelTesterMixin:
|
||||
pt_model = pt_model_class(config)
|
||||
|
||||
# Check we can load pt model in tf and vice-versa with model => model functions
|
||||
|
||||
tf_model = transformers.load_pytorch_model_in_tf2_model(
|
||||
tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
|
||||
)
|
||||
@@ -374,6 +375,8 @@ class TFModelTesterMixin:
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
|
||||
elif name == "pixel_values":
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
|
||||
elif name == "input_features":
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
|
||||
else:
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
|
||||
|
||||
@@ -416,6 +419,8 @@ class TFModelTesterMixin:
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
|
||||
elif name == "pixel_values":
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
|
||||
elif name == "input_features":
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
|
||||
else:
|
||||
pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
|
||||
|
||||
@@ -443,7 +448,24 @@ class TFModelTesterMixin:
|
||||
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
if self.is_encoder_decoder:
|
||||
if model_class.__name__ in ["TFSpeech2TextModel", "TFSpeech2TextForConditionalGeneration"]:
|
||||
inputs = {
|
||||
"decoder_input_ids": tf.keras.Input(
|
||||
batch_shape=(2, max_input),
|
||||
name="decoder_input_ids",
|
||||
dtype="int32",
|
||||
),
|
||||
"input_features": tf.keras.Input(
|
||||
batch_shape=(
|
||||
2,
|
||||
max_input,
|
||||
self.model_tester.input_feat_per_channel * self.model_tester.input_channels,
|
||||
),
|
||||
name="input_features",
|
||||
dtype="float32",
|
||||
),
|
||||
}
|
||||
elif self.is_encoder_decoder:
|
||||
inputs = {
|
||||
"decoder_input_ids": tf.keras.Input(
|
||||
batch_shape=(2, max_input),
|
||||
@@ -511,10 +533,7 @@ class TFModelTesterMixin:
|
||||
outputs_dict = model(inputs)
|
||||
|
||||
inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
|
||||
input_ids = inputs_keywords.pop("input_ids", None)
|
||||
if input_ids is None:
|
||||
input_ids = inputs_keywords.pop("pixel_values", None)
|
||||
outputs_keywords = model(input_ids, **inputs_keywords)
|
||||
outputs_keywords = model(**inputs_keywords)
|
||||
output_dict = outputs_dict[0].numpy()
|
||||
output_keywords = outputs_keywords[0].numpy()
|
||||
|
||||
@@ -699,23 +718,28 @@ class TFModelTesterMixin:
|
||||
|
||||
def test_model_common_attributes(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
list_lm_models = (
|
||||
text_in_text_out_models = (
|
||||
get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
|
||||
+ get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
|
||||
+ get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING)
|
||||
)
|
||||
speech_in_text_out_models = get_values(TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING)
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
|
||||
|
||||
if model_class in list_lm_models:
|
||||
if model_class in text_in_text_out_models:
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_bias()
|
||||
assert isinstance(name, dict)
|
||||
for k, v in name.items():
|
||||
assert isinstance(v, tf.Variable)
|
||||
elif model_class in speech_in_text_out_models:
|
||||
x = model.get_output_embeddings()
|
||||
assert isinstance(x, tf.keras.layers.Layer)
|
||||
name = model.get_bias()
|
||||
assert name is None
|
||||
else:
|
||||
x = model.get_output_embeddings()
|
||||
assert x is None
|
||||
@@ -922,13 +946,13 @@ class TFModelTesterMixin:
|
||||
model = model_class(config)
|
||||
|
||||
if config.bos_token_id is None:
|
||||
# if bos token id is not defined mobel needs input_ids
|
||||
# if bos token id is not defined model needs input_ids
|
||||
with self.assertRaises(AssertionError):
|
||||
model.generate(do_sample=True, max_length=5)
|
||||
# num_return_sequences = 1
|
||||
self._check_generated_ids(model.generate(input_ids, do_sample=True))
|
||||
else:
|
||||
# num_return_sequences = 1
|
||||
elif model_class.__name__ not in ["TFSpeech2TextForConditionalGeneration"]:
|
||||
# Models with non-text inputs won't work here; num_return_sequences = 1
|
||||
self._check_generated_ids(model.generate(do_sample=True, max_length=5))
|
||||
|
||||
with self.assertRaises(AssertionError):
|
||||
@@ -952,6 +976,8 @@ class TFModelTesterMixin:
|
||||
def test_lm_head_model_no_beam_search_generate_dict_outputs(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
input_ids = inputs_dict.get("input_ids", None)
|
||||
if input_ids is None:
|
||||
input_ids = inputs_dict.get("input_features", None)
|
||||
|
||||
# iterate over all generative models
|
||||
for model_class in self.all_generative_model_classes:
|
||||
@@ -988,7 +1014,7 @@ class TFModelTesterMixin:
|
||||
model = model_class(config)
|
||||
|
||||
if config.bos_token_id is None:
|
||||
# if bos token id is not defined mobel needs input_ids, num_return_sequences = 1
|
||||
# if bos token id is not defined model needs input_ids, num_return_sequences = 1
|
||||
self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2))
|
||||
else:
|
||||
# num_return_sequences = 1
|
||||
@@ -1023,6 +1049,8 @@ class TFModelTesterMixin:
|
||||
def test_lm_head_model_beam_search_generate_dict_outputs(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
input_ids = inputs_dict.get("input_ids", None)
|
||||
if input_ids is None:
|
||||
input_ids = inputs_dict.get("input_features", None)
|
||||
|
||||
# iterate over all generative models
|
||||
for model_class in self.all_generative_model_classes:
|
||||
@@ -1072,10 +1100,11 @@ class TFModelTesterMixin:
|
||||
|
||||
# Test that model correctly compute the loss with kwargs
|
||||
prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True)
|
||||
input_name = "input_ids" if "input_ids" in prepared_for_class else "pixel_values"
|
||||
input_ids = prepared_for_class.pop(input_name)
|
||||
possible_input_names = {"input_ids", "pixel_values", "input_features"}
|
||||
input_name = possible_input_names.intersection(set(prepared_for_class)).pop()
|
||||
model_input = prepared_for_class.pop(input_name)
|
||||
|
||||
loss = model(input_ids, **prepared_for_class)[0]
|
||||
loss = model(model_input, **prepared_for_class)[0]
|
||||
self.assertEqual(loss.shape, [loss_size])
|
||||
|
||||
# Test that model correctly compute the loss with a dict
|
||||
|
||||
Reference in New Issue
Block a user