[cache] make all classes cache compatible finally (#38635)
* dump * push other models * fix simple greedy generation * xmod * add fsmt and clean up some mentions of old cache format * gpt-bigcode now follows standards * delete tuple cache reference in generation * fix some models * fix some models * fix mambas and support cache in tapas * fix some more tests * fix copies * delete `_reorder_cache` * another fix copies * fix typos and delete unnecessary test * fix rag generate, needs special cache reordering * fix tapas and superglue * reformer create special cache * recurrent gemma `reorder_cache` was a no-op, delete * fix-copies * fix blip and musicgen pipeline tests * fix reformer * fix reformer, again... * delete `_supports_cache_class` * delete `supports_quantized_cache` * fix failing tests * fix copies * some minor clean up * style * style * fix copies * fix tests * fix copies * create causal mask now needs positions? * fix copies * style * Update tests/test_modeling_common.py Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com> * clean-up of non-generative model after merging main * check `is_decoder` for cache * delete transpose for scores * remove tuple cache from docs everywhere * fix tests * fix copies * fix copies once more * properly deprecate `encoder_attention_mask` in Bert-like models * import `deprecate_kwarg` where needed * fix copies again * fix copies * delete `next_decoder_cache` * fix copies asks to update for PLM * fix copies * rebasing had a few new models, fix them and merge asap! * fix copies once more * fix slow tests * fix tests and update PLM checkpoint * add read token and revert accidentally removed line * oh come on, style * just skip it, read token has no access to PLM yet --------- Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
This commit is contained in:
committed by
GitHub
parent
6cb43defd0
commit
c8524aeb07
@@ -737,9 +737,11 @@ class BertModelIntegrationTest(unittest.TestCase):
|
||||
torch.allclose(res_eager.last_hidden_state, res_sdpa.last_hidden_state, atol=1e-5, rtol=1e-4)
|
||||
)
|
||||
|
||||
# Case where query length != kv_length.
|
||||
res_eager = model(**inp, past_key_values=pkv)
|
||||
res_sdpa = model_sdpa(**inp, past_key_values=pkv)
|
||||
# Case where query length != kv_length. Note that model needs to be a decoder so we can use cache
|
||||
model.config.is_decoder = True
|
||||
model_sdpa.config.is_decoder = True
|
||||
res_eager = model(**inp, past_key_values=pkv, use_cache=True)
|
||||
res_sdpa = model_sdpa(**inp, past_key_values=pkv, use_cache=True)
|
||||
self.assertTrue(
|
||||
torch.allclose(res_eager.last_hidden_state, res_sdpa.last_hidden_state, atol=1e-5, rtol=1e-4)
|
||||
)
|
||||
|
||||
@@ -284,6 +284,7 @@ class BigBirdModelTester:
|
||||
attention_mask=next_attention_mask,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
use_cache=False,
|
||||
output_hidden_states=True,
|
||||
)["hidden_states"][0]
|
||||
output_from_past = model(
|
||||
@@ -292,6 +293,7 @@ class BigBirdModelTester:
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
encoder_attention_mask=encoder_attention_mask,
|
||||
past_key_values=past_key_values,
|
||||
use_cache=True,
|
||||
output_hidden_states=True,
|
||||
)["hidden_states"][0]
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@ from transformers.testing_utils import (
|
||||
Expectations,
|
||||
cleanup,
|
||||
require_bitsandbytes,
|
||||
require_optimum_quanto,
|
||||
require_read_token,
|
||||
require_torch,
|
||||
require_torch_accelerator,
|
||||
@@ -344,6 +345,12 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
|
||||
|
||||
self.assertListEqual([layer_attention.shape for layer_attention in iter_attentions], expected_shapes)
|
||||
|
||||
@require_optimum_quanto
|
||||
@pytest.mark.generate
|
||||
@unittest.skip("Mllama is actually an encoder decoder cache and thus can't supports quant cache")
|
||||
def test_generate_with_quant_cache(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("For some unknown reasons the tests fails in CrossAttention layer when doing torch.sdpa(). ")
|
||||
def test_sdpa_can_compile_dynamic(self):
|
||||
pass
|
||||
|
||||
@@ -770,9 +770,9 @@ class MvpStandaloneDecoderModelTester:
|
||||
|
||||
# get two different outputs
|
||||
output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, attention_mask=attn_mask, past_key_values=past_key_values)[
|
||||
"last_hidden_state"
|
||||
]
|
||||
output_from_past = model(
|
||||
next_tokens, attention_mask=attn_mask, past_key_values=past_key_values, use_cache=True
|
||||
)["last_hidden_state"]
|
||||
|
||||
# select random slice
|
||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||
|
||||
@@ -21,7 +21,7 @@ from transformers import (
|
||||
AutoTokenizer,
|
||||
PerceptionLMProcessor,
|
||||
)
|
||||
from transformers.testing_utils import require_vision
|
||||
from transformers.testing_utils import require_read_token, require_vision
|
||||
from transformers.utils import is_torch_available, is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
@@ -34,11 +34,12 @@ if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
# TEST_MODEL_PATH = "facebook/Perception-LM-1B"
|
||||
TEST_MODEL_PATH = "shumingh/plm_1b_hf" # should be replaced by the above once checkpoints are merged
|
||||
TEST_MODEL_PATH = "facebook/Perception-LM-1B"
|
||||
|
||||
|
||||
@require_vision
|
||||
@require_read_token
|
||||
@unittest.skip("Fequires read token and we didn't requests access yet. FIXME @ydshieh when you are back :)")
|
||||
class PerceptionLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_class = PerceptionLMProcessor
|
||||
|
||||
|
||||
@@ -737,7 +737,7 @@ class ProphetNetStandaloneDecoderModelTester:
|
||||
|
||||
# get two different outputs
|
||||
output_from_no_past = model(next_input_ids)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, past_key_values=past_key_values, use_cache=True)["last_hidden_state"]
|
||||
|
||||
# select random slice
|
||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||
|
||||
@@ -354,9 +354,9 @@ class SpeechT5ForSpeechToTextTester:
|
||||
next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)
|
||||
|
||||
output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
|
||||
output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[
|
||||
"last_hidden_state"
|
||||
]
|
||||
output_from_past = model(
|
||||
next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values, use_cache=True
|
||||
)["last_hidden_state"]
|
||||
|
||||
# select random slice
|
||||
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
|
||||
|
||||
Reference in New Issue
Block a user