Switch return_dict to True by default. (#8530)
* Use the CI to identify failing tests
* Remove from all examples and tests
* More default switch
* Fixes
* More test fixes
* More fixes
* Last fixes hopefully
* Use the CI to identify failing tests
* Remove from all examples and tests
* More default switch
* Fixes
* More test fixes
* More fixes
* Last fixes hopefully
* Run on the real suite
* Fix slow tests
This commit is contained in:
@@ -118,7 +118,7 @@ class GenerationTesterMixin:
|
||||
@staticmethod
|
||||
def _get_encoder_outputs(model, input_ids, attention_mask, num_interleave=1):
|
||||
encoder = model.get_encoder()
|
||||
encoder_outputs = encoder(input_ids, attention_mask=attention_mask, return_dict=True)
|
||||
encoder_outputs = encoder(input_ids, attention_mask=attention_mask)
|
||||
encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.repeat_interleave(
|
||||
num_interleave, dim=0
|
||||
)
|
||||
@@ -344,6 +344,7 @@ class GenerationTesterMixin:
|
||||
def test_beam_sample_generate(self):
|
||||
for model_class in self.all_generative_model_classes:
|
||||
config, input_ids, attention_mask, max_length = self._get_input_ids_and_config()
|
||||
print("Return dict", config.return_dict)
|
||||
logits_warper_kwargs, logits_warper = self._get_warper_and_kwargs(num_beams=1)
|
||||
|
||||
model = model_class(config).to(torch_device)
|
||||
|
||||
@@ -102,7 +102,6 @@ class AlbertModelTester:
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range,
|
||||
num_hidden_groups=self.num_hidden_groups,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -259,7 +259,6 @@ class BartHeadTests(unittest.TestCase):
|
||||
eos_token_id=2,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
return_dict=True,
|
||||
)
|
||||
return config, input_ids, batch_size
|
||||
|
||||
@@ -310,7 +309,6 @@ class BartHeadTests(unittest.TestCase):
|
||||
encoder_ffn_dim=8,
|
||||
decoder_ffn_dim=8,
|
||||
max_position_embeddings=48,
|
||||
return_dict=True,
|
||||
)
|
||||
lm_model = BartForConditionalGeneration(config).to(torch_device)
|
||||
context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device)
|
||||
@@ -713,6 +711,6 @@ class FastIntegrationTests(unittest.TestCase):
|
||||
padding="longest",
|
||||
truncation=True,
|
||||
)
|
||||
features = self.xsum_1_1_model.get_encoder()(**batch, return_dict=True).last_hidden_state
|
||||
features = self.xsum_1_1_model.get_encoder()(**batch).last_hidden_state
|
||||
expected = [[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]]
|
||||
assert_tensors_close(features[0, :3, :3], torch.tensor(expected), atol=1e-3)
|
||||
|
||||
@@ -124,7 +124,6 @@ class BertModelTester:
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
is_decoder=False,
|
||||
initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -89,7 +89,6 @@ class BertGenerationEncoderTester:
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
is_decoder=False,
|
||||
initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, input_mask, token_labels
|
||||
|
||||
@@ -31,7 +31,7 @@ if is_torch_available():
|
||||
class CamembertModelIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_output_embeds_base_model(self):
|
||||
model = CamembertModel.from_pretrained("camembert-base", return_dict=True)
|
||||
model = CamembertModel.from_pretrained("camembert-base")
|
||||
model.to(torch_device)
|
||||
|
||||
input_ids = torch.tensor(
|
||||
|
||||
@@ -657,7 +657,7 @@ class ModelTesterMixin:
|
||||
model.eval()
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(**self._prepare_for_class(inputs_dict, model_class), return_dict=True)
|
||||
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
|
||||
hidden_states = outputs["hidden_states"] if "hidden_states" in outputs else outputs[-1]
|
||||
|
||||
expected_num_layers = getattr(
|
||||
|
||||
@@ -94,7 +94,6 @@ class CTRLModelTester:
|
||||
n_ctx=self.max_position_embeddings,
|
||||
# type_vocab_size=self.type_vocab_size,
|
||||
# initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
|
||||
|
||||
@@ -148,7 +148,7 @@ class DebertaModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
def check_loss_output(self, result):
|
||||
self.parent.assertListEqual(list(result["loss"].size()), [])
|
||||
self.parent.assertListEqual(list(result.loss.size()), [])
|
||||
|
||||
def create_and_check_deberta_model(
|
||||
self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
@@ -160,11 +160,8 @@ class DebertaModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
sequence_output = model(input_ids, token_type_ids=token_type_ids)[0]
|
||||
sequence_output = model(input_ids)[0]
|
||||
|
||||
result = {
|
||||
"sequence_output": sequence_output,
|
||||
}
|
||||
self.parent.assertListEqual(
|
||||
list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size]
|
||||
list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size]
|
||||
)
|
||||
|
||||
def create_and_check_deberta_for_sequence_classification(
|
||||
@@ -174,14 +171,8 @@ class DebertaModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
model = DebertaForSequenceClassification(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
loss, logits = model(
|
||||
input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels
|
||||
)
|
||||
result = {
|
||||
"loss": loss,
|
||||
"logits": logits,
|
||||
}
|
||||
self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
|
||||
result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
||||
self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels])
|
||||
self.check_loss_output(result)
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
|
||||
@@ -110,7 +110,6 @@ if is_torch_available():
|
||||
attention_dropout=self.attention_probs_dropout_prob,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -117,7 +117,6 @@ class DPRModelTester:
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
is_decoder=False,
|
||||
initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
config = DPRConfig(projection_dim=self.projection_dim, **config.to_dict())
|
||||
|
||||
|
||||
@@ -101,7 +101,6 @@ class ElectraModelTester:
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
is_decoder=False,
|
||||
initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (
|
||||
|
||||
@@ -85,7 +85,6 @@ class EncoderDecoderMixin:
|
||||
decoder_input_ids=decoder_input_ids,
|
||||
attention_mask=attention_mask,
|
||||
decoder_attention_mask=decoder_attention_mask,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
@@ -117,7 +116,6 @@ class EncoderDecoderMixin:
|
||||
decoder_input_ids=decoder_input_ids,
|
||||
attention_mask=attention_mask,
|
||||
decoder_attention_mask=decoder_attention_mask,
|
||||
return_dict=True,
|
||||
)
|
||||
self.assertEqual(
|
||||
outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,))
|
||||
@@ -132,7 +130,6 @@ class EncoderDecoderMixin:
|
||||
decoder_input_ids=decoder_input_ids,
|
||||
attention_mask=attention_mask,
|
||||
decoder_attention_mask=decoder_attention_mask,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
@@ -278,7 +275,6 @@ class EncoderDecoderMixin:
|
||||
attention_mask=attention_mask,
|
||||
decoder_attention_mask=decoder_attention_mask,
|
||||
labels=labels,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
loss = outputs_encoder_decoder["loss"]
|
||||
@@ -313,7 +309,6 @@ class EncoderDecoderMixin:
|
||||
attention_mask=attention_mask,
|
||||
decoder_attention_mask=decoder_attention_mask,
|
||||
output_attentions=True,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
encoder_attentions = outputs_encoder_decoder["encoder_attentions"]
|
||||
|
||||
@@ -113,7 +113,6 @@ class FlaubertModelTester(object):
|
||||
initializer_range=self.initializer_range,
|
||||
summary_type=self.summary_type,
|
||||
use_proj=self.use_proj,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (
|
||||
|
||||
@@ -29,7 +29,7 @@ class FlaxBertModelTest(unittest.TestCase):
|
||||
# Check for simple input
|
||||
pt_inputs = tokenizer.encode_plus("This is a simple input", return_tensors=TensorType.PYTORCH)
|
||||
fx_inputs = tokenizer.encode_plus("This is a simple input", return_tensors=TensorType.JAX)
|
||||
pt_outputs = pt_model(**pt_inputs)
|
||||
pt_outputs = pt_model(**pt_inputs).to_tuple()
|
||||
fx_outputs = fx_model(**fx_inputs)
|
||||
|
||||
self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
|
||||
|
||||
@@ -34,7 +34,7 @@ class FlaxRobertaModelTest(unittest.TestCase):
|
||||
|
||||
self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
|
||||
|
||||
for fx_output, pt_output in zip(fx_outputs, pt_outputs):
|
||||
for fx_output, pt_output in zip(fx_outputs, pt_outputs.to_tuple()):
|
||||
self.assert_almost_equals(fx_output, pt_output.numpy(), 5e-4)
|
||||
|
||||
def assert_almost_equals(self, a: ndarray, b: ndarray, tol: float):
|
||||
|
||||
@@ -259,7 +259,6 @@ class FSMTHeadTests(unittest.TestCase):
|
||||
eos_token_id=2,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
def _get_config_and_data(self):
|
||||
|
||||
@@ -140,7 +140,6 @@ class FunnelModelTester:
|
||||
activation_dropout=self.activation_dropout,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (
|
||||
|
||||
@@ -131,7 +131,6 @@ class GPT2ModelTester:
|
||||
bos_token_id=self.bos_token_id,
|
||||
eos_token_id=self.eos_token_id,
|
||||
pad_token_id=self.pad_token_id,
|
||||
return_dict=True,
|
||||
gradient_checkpointing=gradient_checkpointing,
|
||||
)
|
||||
|
||||
|
||||
@@ -125,7 +125,6 @@ class LayoutLMModelTester:
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -113,7 +113,6 @@ class LongformerModelTester:
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range,
|
||||
attention_window=self.attention_window,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -282,7 +282,6 @@ class LxmertModelTester:
|
||||
attention_mask=input_mask,
|
||||
labels=ans,
|
||||
output_attentions=output_attentions,
|
||||
return_dict=True,
|
||||
)
|
||||
result = model(input_ids, visual_feats, bounding_boxes, labels=ans)
|
||||
result = model(
|
||||
@@ -302,7 +301,6 @@ class LxmertModelTester:
|
||||
attention_mask=input_mask,
|
||||
labels=ans,
|
||||
output_attentions=not output_attentions,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
self.parent.assertEqual(result.question_answering_score.shape, (self.batch_size, self.num_qa_labels))
|
||||
@@ -335,7 +333,6 @@ class LxmertModelTester:
|
||||
matched_label=matched_label,
|
||||
ans=ans,
|
||||
output_attentions=output_attentions,
|
||||
return_dict=True,
|
||||
)
|
||||
result = model(
|
||||
input_ids,
|
||||
@@ -390,7 +387,6 @@ class LxmertModelTester:
|
||||
matched_label=matched_label,
|
||||
ans=ans,
|
||||
output_attentions=not output_attentions,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
|
||||
@@ -427,7 +423,6 @@ class LxmertModelTester:
|
||||
token_type_ids=token_type_ids,
|
||||
attention_mask=input_mask,
|
||||
ans=ans,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
result_qa = model_qa(
|
||||
@@ -437,7 +432,6 @@ class LxmertModelTester:
|
||||
labels=ans,
|
||||
token_type_ids=token_type_ids,
|
||||
attention_mask=input_mask,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
model_pretrain.resize_num_qa_labels(num_small_labels)
|
||||
@@ -450,7 +444,6 @@ class LxmertModelTester:
|
||||
token_type_ids=token_type_ids,
|
||||
attention_mask=input_mask,
|
||||
ans=less_labels_ans,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
result_qa_less = model_qa(
|
||||
@@ -460,7 +453,6 @@ class LxmertModelTester:
|
||||
labels=less_labels_ans,
|
||||
token_type_ids=token_type_ids,
|
||||
attention_mask=input_mask,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
model_pretrain.resize_num_qa_labels(num_large_labels)
|
||||
@@ -473,7 +465,6 @@ class LxmertModelTester:
|
||||
token_type_ids=token_type_ids,
|
||||
attention_mask=input_mask,
|
||||
ans=more_labels_ans,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
result_qa_more = model_qa(
|
||||
@@ -483,7 +474,6 @@ class LxmertModelTester:
|
||||
labels=more_labels_ans,
|
||||
token_type_ids=token_type_ids,
|
||||
attention_mask=input_mask,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
model_qa_labels = model_qa.num_qa_labels
|
||||
|
||||
@@ -50,7 +50,6 @@ class ModelTester:
|
||||
decoder_ffn_dim=32,
|
||||
max_position_embeddings=48,
|
||||
add_final_layer_norm=True,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
|
||||
@@ -37,7 +37,6 @@ class ModelTester:
|
||||
decoder_ffn_dim=32,
|
||||
max_position_embeddings=48,
|
||||
add_final_layer_norm=True,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
@@ -132,7 +131,6 @@ class MBartEnroIntegrationTest(AbstractSeq2SeqIntegrationTest):
|
||||
decoder_ffn_dim=32,
|
||||
max_position_embeddings=48,
|
||||
add_final_layer_norm=True,
|
||||
return_dict=True,
|
||||
)
|
||||
lm_model = MBartForConditionalGeneration(config).to(torch_device)
|
||||
context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device)
|
||||
|
||||
@@ -124,7 +124,6 @@ class MobileBertModelTester:
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
is_decoder=False,
|
||||
initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -94,7 +94,6 @@ class OpenAIGPTModelTester:
|
||||
# type_vocab_size=self.type_vocab_size,
|
||||
# initializer_range=self.initializer_range
|
||||
pad_token_id=self.pad_token_id,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
|
||||
|
||||
@@ -33,7 +33,6 @@ class ModelTester:
|
||||
decoder_ffn_dim=32,
|
||||
max_position_embeddings=48,
|
||||
add_final_layer_norm=True,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
def prepare_config_and_inputs_for_common(self):
|
||||
|
||||
@@ -142,7 +142,6 @@ class ProphetNetModelTester:
|
||||
disable_ngram_loss=self.disable_ngram_loss,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
is_encoder_decoder=self.is_encoder_decoder,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (
|
||||
@@ -344,7 +343,6 @@ class ProphetNetModelTester:
|
||||
decoder_input_ids=decoder_input_ids,
|
||||
attention_mask=attention_mask,
|
||||
decoder_attention_mask=decoder_attention_mask,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
tied_model_result = tied_model(
|
||||
@@ -352,7 +350,6 @@ class ProphetNetModelTester:
|
||||
decoder_input_ids=decoder_input_ids,
|
||||
attention_mask=attention_mask,
|
||||
decoder_attention_mask=decoder_attention_mask,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
# check that models has less parameters
|
||||
@@ -419,7 +416,6 @@ class ProphetNetModelTester:
|
||||
attention_mask=attention_mask,
|
||||
decoder_attention_mask=decoder_attention_mask,
|
||||
labels=lm_labels,
|
||||
return_dict=True,
|
||||
)
|
||||
self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(128.2925, device=torch_device), atol=1e-3))
|
||||
|
||||
@@ -433,9 +429,7 @@ class ProphetNetModelTester:
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
outputs_no_mask = model(
|
||||
input_ids=input_ids[:, :5], decoder_input_ids=decoder_input_ids[:, :5], return_dict=True
|
||||
)
|
||||
outputs_no_mask = model(input_ids=input_ids[:, :5], decoder_input_ids=decoder_input_ids[:, :5])
|
||||
attention_mask = torch.ones_like(input_ids)
|
||||
decoder_attention_mask = torch.ones_like(decoder_input_ids)
|
||||
|
||||
@@ -446,7 +440,6 @@ class ProphetNetModelTester:
|
||||
attention_mask=attention_mask,
|
||||
decoder_input_ids=decoder_input_ids,
|
||||
decoder_attention_mask=decoder_attention_mask,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
# check encoder
|
||||
@@ -524,7 +517,6 @@ class ProphetNetStandaloneDecoderModelTester:
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
ngram=2,
|
||||
return_dict=True,
|
||||
num_buckets=32,
|
||||
relative_max_distance=128,
|
||||
disable_ngram_loss=False,
|
||||
@@ -562,7 +554,6 @@ class ProphetNetStandaloneDecoderModelTester:
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.add_cross_attention = add_cross_attention
|
||||
self.is_encoder_decoder = is_encoder_decoder
|
||||
self.return_dict = return_dict
|
||||
|
||||
self.scope = None
|
||||
self.decoder_key_length = decoder_seq_length
|
||||
@@ -602,7 +593,6 @@ class ProphetNetStandaloneDecoderModelTester:
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
add_cross_attention=self.add_cross_attention,
|
||||
is_encoder_decoder=self.is_encoder_decoder,
|
||||
return_dict=self.return_dict,
|
||||
)
|
||||
|
||||
return (
|
||||
@@ -757,7 +747,6 @@ class ProphetNetStandaloneEncoderModelTester:
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
eos_token_id=2,
|
||||
return_dict=True,
|
||||
num_buckets=32,
|
||||
relative_max_distance=128,
|
||||
disable_ngram_loss=False,
|
||||
@@ -794,7 +783,6 @@ class ProphetNetStandaloneEncoderModelTester:
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.add_cross_attention = add_cross_attention
|
||||
self.is_encoder_decoder = is_encoder_decoder
|
||||
self.return_dict = return_dict
|
||||
|
||||
self.scope = None
|
||||
self.decoder_key_length = decoder_seq_length
|
||||
@@ -829,7 +817,6 @@ class ProphetNetStandaloneEncoderModelTester:
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
add_cross_attention=self.add_cross_attention,
|
||||
is_encoder_decoder=self.is_encoder_decoder,
|
||||
return_dict=self.return_dict,
|
||||
)
|
||||
|
||||
return (
|
||||
@@ -919,7 +906,6 @@ class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
|
||||
# methods overwrite method in `test_modeling_common.py`
|
||||
def test_attention_outputs(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
config.return_dict = True
|
||||
|
||||
seq_len = getattr(self.model_tester, "seq_length", None)
|
||||
decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
|
||||
@@ -933,7 +919,6 @@ class ProphetNetModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
|
||||
for model_class in self.all_model_classes:
|
||||
inputs_dict["output_attentions"] = True
|
||||
inputs_dict["output_hidden_states"] = False
|
||||
config.return_dict = True
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
@@ -1121,7 +1106,6 @@ class ProphetNetModelIntegrationTest(unittest.TestCase):
|
||||
attention_mask=None,
|
||||
encoder_outputs=None,
|
||||
decoder_input_ids=decoder_prev_ids,
|
||||
return_dict=True,
|
||||
)
|
||||
output_predited_logits = output[0]
|
||||
expected_shape = torch.Size((1, 12, 30522))
|
||||
@@ -1143,9 +1127,7 @@ class ProphetNetModelIntegrationTest(unittest.TestCase):
|
||||
assert torch.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4)
|
||||
|
||||
# decoder outputs
|
||||
decoder_outputs = model.prophetnet.decoder(
|
||||
decoder_prev_ids, encoder_hidden_states=encoder_outputs, return_dict=True
|
||||
)
|
||||
decoder_outputs = model.prophetnet.decoder(decoder_prev_ids, encoder_hidden_states=encoder_outputs)
|
||||
predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 12, -1)
|
||||
predicting_streams_logits = model.lm_head(predicting_streams)
|
||||
next_first_stream_logits = predicting_streams_logits[:, 0]
|
||||
|
||||
@@ -174,7 +174,6 @@ class ReformerModelTester:
|
||||
attn_layers=self.attn_layers,
|
||||
pad_token_id=self.pad_token_id,
|
||||
hash_seed=self.hash_seed,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (
|
||||
|
||||
@@ -103,7 +103,6 @@ class RobertaModelTester:
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -131,7 +131,6 @@ if is_torch_available():
|
||||
post_attention_groups=self.post_attention_groups,
|
||||
intermediate_groups=self.intermediate_groups,
|
||||
output_groups=self.output_groups,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -115,7 +115,6 @@ class T5ModelTester:
|
||||
bos_token_id=self.pad_token_id,
|
||||
pad_token_id=self.pad_token_id,
|
||||
decoder_start_token_id=self.decoder_start_token_id,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (
|
||||
|
||||
@@ -121,7 +121,6 @@ class TFAlbertModelTester:
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -182,7 +182,6 @@ class TFBartHeadTests(unittest.TestCase):
|
||||
eos_token_id=2,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
return_dict=True,
|
||||
decoder_start_token_id=2,
|
||||
)
|
||||
return config, input_ids, batch_size
|
||||
@@ -206,7 +205,6 @@ class TFBartHeadTests(unittest.TestCase):
|
||||
encoder_ffn_dim=32,
|
||||
decoder_ffn_dim=32,
|
||||
max_position_embeddings=48,
|
||||
return_dict=True,
|
||||
)
|
||||
lm_model = TFBartForConditionalGeneration(config)
|
||||
context = tf.fill((7, 2), 4)
|
||||
@@ -356,7 +354,7 @@ class FasterTFBartModelIntegrationTests(unittest.TestCase):
|
||||
padding="longest",
|
||||
truncation=True,
|
||||
)
|
||||
features = self.xsum_1_1_model.get_encoder()(**batch, return_dict=True).last_hidden_state
|
||||
features = self.xsum_1_1_model.get_encoder()(**batch).last_hidden_state
|
||||
import numpy as np
|
||||
|
||||
expected = np.array([[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]])
|
||||
|
||||
@@ -120,7 +120,6 @@ class TFBertModelTester:
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -39,7 +39,7 @@ class TFCamembertModelIntegrationTest(unittest.TestCase):
|
||||
dtype=tf.int32,
|
||||
) # J'aime le camembert !"
|
||||
|
||||
output = model(input_ids, return_dict=True)["last_hidden_state"]
|
||||
output = model(input_ids)["last_hidden_state"]
|
||||
expected_shape = tf.TensorShape((1, 10, 768))
|
||||
self.assertEqual(output.shape, expected_shape)
|
||||
# compare the actual values for a slice.
|
||||
|
||||
@@ -284,7 +284,7 @@ class TFModelTesterMixin:
|
||||
if isinstance(after_outputs, tf.Tensor):
|
||||
out_1 = after_outputs.numpy()
|
||||
elif isinstance(after_outputs, dict):
|
||||
out_1 = after_outputs[list(after_outputs.keys())[0]]
|
||||
out_1 = after_outputs[list(after_outputs.keys())[0]].numpy()
|
||||
else:
|
||||
out_1 = after_outputs[0].numpy()
|
||||
out_2 = outputs[0].numpy()
|
||||
|
||||
@@ -94,7 +94,6 @@ class TFCTRLModelTester(object):
|
||||
n_ctx=self.max_position_embeddings,
|
||||
# type_vocab_size=self.type_vocab_size,
|
||||
# initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
|
||||
|
||||
@@ -91,7 +91,6 @@ class TFDistilBertModelTester:
|
||||
attention_dropout=self.attention_probs_dropout_prob,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -97,7 +97,6 @@ class TFElectraModelTester:
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -114,7 +114,6 @@ class TFFlaubertModelTester:
|
||||
summary_type=self.summary_type,
|
||||
use_proj=self.use_proj,
|
||||
bos_token_id=self.bos_token_id,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (
|
||||
|
||||
@@ -137,7 +137,6 @@ class TFFunnelModelTester:
|
||||
activation_dropout=self.activation_dropout,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (
|
||||
|
||||
@@ -104,7 +104,6 @@ class TFGPT2ModelTester:
|
||||
# initializer_range=self.initializer_range
|
||||
bos_token_id=self.bos_token_id,
|
||||
eos_token_id=self.eos_token_id,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
|
||||
|
||||
@@ -594,7 +594,9 @@ class TFLongformerModelIntegrationTest(unittest.TestCase):
|
||||
# 'Hello world! ' repeated 1000 times
|
||||
input_ids = tf.convert_to_tensor([[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=tf.dtypes.int32)
|
||||
|
||||
loss, prediction_scores = model(input_ids, labels=input_ids)
|
||||
output = model(input_ids, labels=input_ids)
|
||||
loss = output.loss
|
||||
prediction_scores = output.logits
|
||||
|
||||
expected_loss = tf.constant(0.0073798)
|
||||
expected_prediction_scores_sum = tf.constant(-610476600.0)
|
||||
|
||||
@@ -297,7 +297,6 @@ class TFLxmertModelTester(object):
|
||||
matched_label=matched_label,
|
||||
ans=ans,
|
||||
output_attentions=output_attentions,
|
||||
return_dict=True,
|
||||
)
|
||||
result = model(
|
||||
input_ids,
|
||||
@@ -352,7 +351,6 @@ class TFLxmertModelTester(object):
|
||||
matched_label=matched_label,
|
||||
ans=ans,
|
||||
output_attentions=not output_attentions,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
|
||||
@@ -695,7 +693,8 @@ class TFLxmertModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
model = tf.keras.models.load_model(tmpdirname)
|
||||
outputs = model(class_inputs_dict)
|
||||
|
||||
language_hidden_states, vision_hidden_states = outputs[-2], outputs[-1]
|
||||
language_hidden_states = outputs["language_hidden_states"]
|
||||
vision_hidden_states = outputs["vision_hidden_states"]
|
||||
|
||||
self.assertEqual(len(language_hidden_states), self.model_tester.num_hidden_layers["language"] + 1)
|
||||
self.assertEqual(len(vision_hidden_states), self.model_tester.num_hidden_layers["vision"] + 1)
|
||||
@@ -731,11 +730,9 @@ class TFLxmertModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
model = tf.keras.models.load_model(tmpdirname)
|
||||
outputs = model(class_inputs_dict)
|
||||
|
||||
language_attentions, vision_attentions, cross_encoder_attentions = (
|
||||
outputs[-3],
|
||||
outputs[-2],
|
||||
outputs[-1],
|
||||
)
|
||||
language_attentions = outputs["language_attentions"]
|
||||
vision_attentions = outputs["vision_attentions"]
|
||||
cross_encoder_attentions = outputs["cross_encoder_attentions"]
|
||||
|
||||
self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"])
|
||||
self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"])
|
||||
|
||||
@@ -139,7 +139,6 @@ class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase):
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range,
|
||||
embedding_size=self.embedding_size,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -99,7 +99,6 @@ class TFOpenAIGPTModelTester:
|
||||
n_ctx=self.max_position_embeddings,
|
||||
# type_vocab_size=self.type_vocab_size,
|
||||
# initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
|
||||
|
||||
@@ -97,7 +97,6 @@ class TFRobertaModelTester:
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
type_vocab_size=self.type_vocab_size,
|
||||
initializer_range=self.initializer_range,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
|
||||
|
||||
@@ -78,7 +78,6 @@ class TFT5ModelTester:
|
||||
bos_token_id=self.pad_token_id,
|
||||
pad_token_id=self.pad_token_id,
|
||||
decoder_start_token_id=self.pad_token_id,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (config, input_ids, input_mask, token_labels)
|
||||
|
||||
@@ -77,7 +77,6 @@ class TFTransfoXLModelTester:
|
||||
div_val=self.div_val,
|
||||
n_layer=self.num_hidden_layers,
|
||||
eos_token_id=self.eos_token_id,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (config, input_ids_1, input_ids_2, lm_labels)
|
||||
|
||||
@@ -114,7 +114,6 @@ class TFXLMModelTester:
|
||||
summary_type=self.summary_type,
|
||||
use_proj=self.use_proj,
|
||||
bos_token_id=self.bos_token_id,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (
|
||||
|
||||
@@ -39,7 +39,7 @@ class TFFlaubertModelIntegrationTest(unittest.TestCase):
|
||||
"attention_mask": tf.convert_to_tensor([[1, 1, 1, 1, 1, 1]], dtype=tf.int32),
|
||||
}
|
||||
|
||||
output = model(features, return_dict=True)["last_hidden_state"]
|
||||
output = model(features)["last_hidden_state"]
|
||||
expected_shape = tf.TensorShape((1, 6, 768))
|
||||
self.assertEqual(output.shape, expected_shape)
|
||||
# compare the actual values for a slice.
|
||||
|
||||
@@ -111,7 +111,6 @@ class TFXLNetModelTester:
|
||||
bos_token_id=self.bos_token_id,
|
||||
pad_token_id=self.pad_token_id,
|
||||
eos_token_id=self.eos_token_id,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (
|
||||
|
||||
@@ -78,7 +78,6 @@ class TransfoXLModelTester:
|
||||
div_val=self.div_val,
|
||||
n_layer=self.num_hidden_layers,
|
||||
eos_token_id=self.eos_token_id,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (config, input_ids_1, input_ids_2, lm_labels)
|
||||
|
||||
@@ -116,7 +116,6 @@ class XLMModelTester:
|
||||
use_proj=self.use_proj,
|
||||
num_labels=self.num_labels,
|
||||
bos_token_id=self.bos_token_id,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (
|
||||
|
||||
@@ -32,7 +32,7 @@ if is_torch_available():
|
||||
class XLMRobertaModelIntegrationTest(unittest.TestCase):
|
||||
@slow
|
||||
def test_xlm_roberta_base(self):
|
||||
model = XLMRobertaModel.from_pretrained("xlm-roberta-base", return_dict=True)
|
||||
model = XLMRobertaModel.from_pretrained("xlm-roberta-base")
|
||||
input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
|
||||
# The dog is cute and lives in the garden house
|
||||
|
||||
@@ -51,7 +51,7 @@ class XLMRobertaModelIntegrationTest(unittest.TestCase):
|
||||
|
||||
@slow
|
||||
def test_xlm_roberta_large(self):
|
||||
model = XLMRobertaModel.from_pretrained("xlm-roberta-large", return_dict=True)
|
||||
model = XLMRobertaModel.from_pretrained("xlm-roberta-large")
|
||||
input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]])
|
||||
# The dog is cute and lives in the garden house
|
||||
|
||||
|
||||
@@ -148,7 +148,6 @@ class XLNetModelTester:
|
||||
bos_token_id=self.bos_token_id,
|
||||
pad_token_id=self.pad_token_id,
|
||||
eos_token_id=self.eos_token_id,
|
||||
return_dict=True,
|
||||
)
|
||||
|
||||
return (
|
||||
|
||||
Reference in New Issue
Block a user