Bart: new cache format (#35314)

* bart compile

* add mbart

* some more models touched by fix-copies

* more

* more models

* even more models

* fix copies

* fix tests

* fix copies

* fix

* BioGPT now accepts position ids (possibly a breaking change?)

* fix failing non-slow tests

* fix some tests

* should not be removed

* small update

* Update src/transformers/models/bart/modeling_bart.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* update for latest `main`

* fix copies

* clone `update_causal_mask` from llama

* tmp

* fixup

* why? how?

* fix bart tests

* don't skip test

* address comments

* fix tests

* fix

* fixup and delete the file

---------

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
Raushan Turganbay
2025-05-16 13:26:54 +02:00
committed by GitHub
parent 3ab47b6ce3
commit 01ad9f4b49
46 changed files with 3904 additions and 1995 deletions

View File

@@ -87,7 +87,7 @@ class BartModelTester:
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=20,
max_position_embeddings=50,
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
@@ -1164,8 +1164,7 @@ class BartModelIntegrationTests(unittest.TestCase):
[FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY],
max_length=1024,
padding="max_length",
truncation_strategy="only_first",
truncation=True,
truncation="only_first",
return_tensors="pt",
)
@@ -1301,7 +1300,7 @@ class BartStandaloneDecoderModelTester:
decoder_layers=2,
encoder_attention_heads=4,
decoder_attention_heads=4,
max_position_embeddings=30,
max_position_embeddings=50,
is_encoder_decoder=False,
pad_token_id=0,
bos_token_id=1,
@@ -1365,6 +1364,7 @@ class BartStandaloneDecoderModelTester:
decoder_start_token_id=self.decoder_start_token_id,
max_position_embeddings=self.max_position_embeddings,
is_encoder_decoder=self.is_encoder_decoder,
forced_eos_token_id=None,
)
return (
@@ -1465,9 +1465,9 @@ class BartStandaloneDecoderModelTester:
# get two different outputs
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[
"last_hidden_state"
]
output_from_past = model(
next_tokens, attention_mask=attn_mask, past_key_values=past_key_values, use_cache=True
)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()