Bart: new cache format (#35314)
* bart compile * add mbart * some more models touched by fix-copies * more * more models * even more models * fix copies * fix tests * fix copies * fix * biogpt accepts position ids now (breaking?) * fix failing non-slow tests * fix some tests * should not be removed * small update * Update src/transformers/models/bart/modeling_bart.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * update for last `main` * fix copies * clone `update_causal_mask` from llama * tmp * fixup * why? how? * fix bart tests * dont skip test * address comments * fix tests * fix * fixup and delete the file --------- Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
committed by
GitHub
parent
3ab47b6ce3
commit
01ad9f4b49
@@ -87,7 +87,7 @@ class BartModelTester:
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=20,
|
||||
max_position_embeddings=50,
|
||||
eos_token_id=2,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
@@ -1164,8 +1164,7 @@ class BartModelIntegrationTests(unittest.TestCase):
|
||||
[FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY],
|
||||
max_length=1024,
|
||||
padding="max_length",
|
||||
truncation_strategy="only_first",
|
||||
truncation=True,
|
||||
truncation="only_first",
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
@@ -1301,7 +1300,7 @@ class BartStandaloneDecoderModelTester:
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
max_position_embeddings=50,
|
||||
is_encoder_decoder=False,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
@@ -1365,6 +1364,7 @@ class BartStandaloneDecoderModelTester:
|
||||
decoder_start_token_id=self.decoder_start_token_id,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
is_encoder_decoder=self.is_encoder_decoder,
|
||||
forced_eos_token_id=None,
|
||||
)
|
||||
|
||||
return (
|
||||
@@ -1465,9 +1465,9 @@ class BartStandaloneDecoderModelTester:
|
||||
|
||||
# get two different outputs
|
||||
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[
|
||||
"last_hidden_state"
|
||||
]
|
||||
output_from_past = model(
|
||||
next_tokens, attention_mask=attn_mask, past_key_values=past_key_values, use_cache=True
|
||||
)["last_hidden_state"]
|
||||
|
||||
# select random slice
|
||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||
|
||||
@@ -611,7 +611,7 @@ class BigBirdPegasusStandaloneDecoderModelTester:
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
max_position_embeddings=50,
|
||||
is_encoder_decoder=False,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
@@ -767,7 +767,7 @@ class BigBirdPegasusStandaloneDecoderModelTester:
|
||||
|
||||
# get two different outputs
|
||||
output_from_no_past = model(next_input_ids)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, past_key_values=past_key_values, use_cache=True)["last_hidden_state"]
|
||||
|
||||
# select random slice
|
||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||
|
||||
@@ -320,6 +320,7 @@ class BioGptModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
|
||||
# Define PAD Token = EOS Token = 50256
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
model.config.pad_token_id = model.config.eos_token_id
|
||||
model.generation_config.pad_token_id = model.generation_config.eos_token_id
|
||||
|
||||
# use different length sentences to test batching
|
||||
sentences = [
|
||||
@@ -333,10 +334,11 @@ class BioGptModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
|
||||
outputs = model.generate(
|
||||
input_ids=input_ids,
|
||||
attention_mask=inputs["attention_mask"].to(torch_device),
|
||||
max_new_tokens=10,
|
||||
)
|
||||
|
||||
inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
|
||||
output_non_padded = model.generate(input_ids=inputs_non_padded)
|
||||
output_non_padded = model.generate(input_ids=inputs_non_padded, max_new_tokens=10)
|
||||
|
||||
num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().item()
|
||||
inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
|
||||
|
||||
@@ -356,7 +356,7 @@ class BlenderbotStandaloneDecoderModelTester:
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
max_position_embeddings=50,
|
||||
is_encoder_decoder=False,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
@@ -500,9 +500,9 @@ class BlenderbotStandaloneDecoderModelTester:
|
||||
|
||||
# get two different outputs
|
||||
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[
|
||||
"last_hidden_state"
|
||||
]
|
||||
output_from_past = model(
|
||||
next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, use_cache=True
|
||||
)["last_hidden_state"]
|
||||
|
||||
# select random slice
|
||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||
|
||||
@@ -366,7 +366,7 @@ class BlenderbotSmallStandaloneDecoderModelTester:
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
max_position_embeddings=50,
|
||||
is_encoder_decoder=False,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
@@ -509,9 +509,9 @@ class BlenderbotSmallStandaloneDecoderModelTester:
|
||||
|
||||
# get two different outputs
|
||||
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, past_key_values=past_key_values, attention_mask=attn_mask)[
|
||||
"last_hidden_state"
|
||||
]
|
||||
output_from_past = model(
|
||||
next_tokens, past_key_values=past_key_values, attention_mask=attn_mask, use_cache=True
|
||||
)["last_hidden_state"]
|
||||
|
||||
# select random slice
|
||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||
|
||||
@@ -170,8 +170,9 @@ class InformerModelTester:
|
||||
|
||||
embed_positions = InformerSinusoidalPositionalEmbedding(
|
||||
config.context_length + config.prediction_length, config.d_model
|
||||
).to(torch_device)
|
||||
)
|
||||
embed_positions._init_weight()
|
||||
embed_positions = embed_positions.to(torch_device)
|
||||
self.parent.assertTrue(torch.equal(model.encoder.embed_positions.weight, embed_positions.weight))
|
||||
self.parent.assertTrue(torch.equal(model.decoder.embed_positions.weight, embed_positions.weight))
|
||||
|
||||
|
||||
@@ -82,7 +82,7 @@ class M2M100ModelTester:
|
||||
attention_probs_dropout_prob=0.1,
|
||||
encoder_layerdrop=0.0,
|
||||
decoder_layerdrop=0.0,
|
||||
max_position_embeddings=20,
|
||||
max_position_embeddings=50,
|
||||
eos_token_id=2,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
@@ -426,7 +426,7 @@ class M2M100ModelIntegrationTests(unittest.TestCase):
|
||||
Overwriting the common test as the test is flaky on tiny models
|
||||
"""
|
||||
model = M2M100ForConditionalGeneration.from_pretrained(
|
||||
"facebook/m2m100_418M", attn_implementation="flash_attention_2"
|
||||
"facebook/m2m100_418M", torch_dtype=torch.float16, attn_implementation="flash_attention_2"
|
||||
).to(torch_device)
|
||||
|
||||
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="fr", tgt_lang="en")
|
||||
|
||||
@@ -99,7 +99,7 @@ class MarianModelTester:
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=20,
|
||||
max_position_embeddings=100,
|
||||
eos_token_id=2,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
@@ -653,7 +653,7 @@ class MarianStandaloneDecoderModelTester:
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
max_position_embeddings=100,
|
||||
is_encoder_decoder=False,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
@@ -796,9 +796,9 @@ class MarianStandaloneDecoderModelTester:
|
||||
|
||||
# get two different outputs
|
||||
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[
|
||||
"last_hidden_state"
|
||||
]
|
||||
output_from_past = model(
|
||||
next_tokens, attention_mask=attn_mask, past_key_values=past_key_values, use_cache=True
|
||||
)["last_hidden_state"]
|
||||
|
||||
# select random slice
|
||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||
|
||||
@@ -538,7 +538,7 @@ class MBartStandaloneDecoderModelTester:
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
max_position_embeddings=50,
|
||||
is_encoder_decoder=False,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
@@ -681,9 +681,9 @@ class MBartStandaloneDecoderModelTester:
|
||||
|
||||
# get two different outputs
|
||||
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[
|
||||
"last_hidden_state"
|
||||
]
|
||||
output_from_past = model(
|
||||
next_tokens, attention_mask=attn_mask, past_key_values=past_key_values, use_cache=True
|
||||
)["last_hidden_state"]
|
||||
|
||||
# select random slice
|
||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||
|
||||
@@ -542,9 +542,9 @@ class PegasusStandaloneDecoderModelTester:
|
||||
|
||||
# get two different outputs
|
||||
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[
|
||||
"last_hidden_state"
|
||||
]
|
||||
output_from_past = model(
|
||||
next_tokens, attention_mask=attn_mask, past_key_values=past_key_values, use_cache=True
|
||||
)["last_hidden_state"]
|
||||
|
||||
# select random slice
|
||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||
|
||||
@@ -78,7 +78,7 @@ class PegasusXModelTester:
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.1,
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=20,
|
||||
max_position_embeddings=50,
|
||||
eos_token_id=2,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
@@ -676,7 +676,7 @@ class PegasusXStandaloneDecoderModelTester:
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
max_position_embeddings=50,
|
||||
is_encoder_decoder=False,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
@@ -819,7 +819,7 @@ class PegasusXStandaloneDecoderModelTester:
|
||||
|
||||
# get two different outputs
|
||||
output_from_no_past = model(next_input_ids)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, past_key_values=past_key_values, use_cache=True)["last_hidden_state"]
|
||||
|
||||
# select random slice
|
||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||
|
||||
@@ -496,7 +496,7 @@ class PLBartStandaloneDecoderModelTester:
|
||||
decoder_layers=2,
|
||||
encoder_attention_heads=4,
|
||||
decoder_attention_heads=4,
|
||||
max_position_embeddings=30,
|
||||
max_position_embeddings=50,
|
||||
is_encoder_decoder=False,
|
||||
pad_token_id=0,
|
||||
bos_token_id=1,
|
||||
@@ -634,9 +634,9 @@ class PLBartStandaloneDecoderModelTester:
|
||||
|
||||
# get two different outputs
|
||||
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[
|
||||
"last_hidden_state"
|
||||
]
|
||||
output_from_past = model(
|
||||
next_tokens, attention_mask=attn_mask, past_key_values=past_key_values, use_cache=True
|
||||
)["last_hidden_state"]
|
||||
|
||||
# select random slice
|
||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||
|
||||
Reference in New Issue
Block a user