Cache: init empty cache when use_cache (#34274)

* fix * fix tests * fix copies * add docs * Revert "add docs" This reverts commit 32d35634f12ba02781d2ebdee0c8dcfbe992a7b9. * qwen move deltas * mllama can potentially fullgraph compile * enable mllama compile and fix tests * remove mllama fixes
2024-11-25 10:11:33 +01:00
parent 1339a14dca
commit c1a8520419
7 changed files with 57 additions and 64 deletions
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -1531,6 +1531,14 @@ class GenerationTesterMixin:
            embed_dim = getattr(text_config, "d_model", text_config.hidden_size)
            per_head_embed_dim = embed_dim // num_attention_heads

+            # some models have a different number of heads for query vs key/value, so we need to assign the correct value
+            # BUT only after `per_head_embed_dim` is set
+            num_attention_heads = (
+                text_config.num_key_value_heads
+                if getattr(text_config, "num_key_value_heads", None) is not None
+                else num_attention_heads
+            )
+
            past_kv = outputs["past_key_values"]
            self.assertEqual(len(past_kv), num_hidden_layers)

--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -333,6 +333,10 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
    def test_generate_from_inputs_embeds_with_static_cache(self):
        pass

+    @unittest.skip(reason="Can't compile fullgraph due to dynamic control flow in `prepare_inputs_for_generate`")
+    def test_generate_compile_fullgraph(self):
+        pass
+

@require_torch
 class Qwen2VLIntegrationTest(unittest.TestCase):
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -2343,7 +2343,8 @@ class ModelTesterMixin:
                            recursive_check(tuple_iterable_value, dict_iterable_value)
                    elif tuple_object is None:
                        return
-                    else:
+                    # model might return non-tensor objects (e.g. Cache class)
+                    elif isinstance(tuple_object, torch.Tensor):
                        self.assertTrue(
                            torch.allclose(
                                set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5