🧹 Remove deprecated RotaryEmbedding parts in the Attention layers (#34858)

* update * style * fix missing args * remove last trace of old rope classes * remove deprecated copied from * fix copies * trigger CIs * post rebase clean-up * reverse mistral * cleanup after dropping commits * Add comment
2024-12-11 11:16:52 +01:00
parent 9094b87dd4
commit d363e71d0e
31 changed files with 151 additions and 748 deletions
--- a/tests/models/gpt_neox/test_modeling_gpt_neox.py
+++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py
@@ -37,11 +37,7 @@ if is_torch_available():
        GPTNeoXForTokenClassification,
        GPTNeoXModel,
    )
-    from transformers.models.gpt_neox.modeling_gpt_neox import (
-        GPTNeoXDynamicNTKScalingRotaryEmbedding,
-        GPTNeoXLinearScalingRotaryEmbedding,
-        GPTNeoXRotaryEmbedding,
-    )
+    from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXRotaryEmbedding


 class GPTNeoXModelTester:
@@ -400,11 +396,12 @@ class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi

        # Sanity check linear RoPE scaling
        # New position "x" should match original position with index "x/scaling_factor"
-        linear_scaling_rope = GPTNeoXLinearScalingRotaryEmbedding(
+        linear_scaling_rope = GPTNeoXRotaryEmbedding(
            head_dim,
            max_position_embeddings=config.max_position_embeddings,
            base=config.rotary_emb_base,
            scaling_factor=scaling_factor,
+            rope_type="linear",
        ).to(torch_device)
        linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short)
        linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long)
@@ -418,11 +415,12 @@ class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
        # Sanity check Dynamic NTK RoPE scaling
        # Scaling should only be observed after a long input is fed. We can observe that the frequencies increase
        # with scaling_factor (or that `inv_freq` decreases)
-        ntk_scaling_rope = GPTNeoXDynamicNTKScalingRotaryEmbedding(
+        ntk_scaling_rope = GPTNeoXRotaryEmbedding(
            head_dim,
            max_position_embeddings=config.max_position_embeddings,
            base=config.rotary_emb_base,
            scaling_factor=scaling_factor,
+            rope_type="dynamic",
        ).to(torch_device)
        ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short)
        ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long)