Llama/GPTNeoX: add RoPE scaling (#24653)

* add rope_scaling

* tmp commit

* add gptneox

* add tests

* GPTNeoX can now handle long inputs, so the pipeline test was wrong

* Update src/transformers/models/open_llama/configuration_open_llama.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* remove ntk

* remove redundant validation

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
Joao Gante
2023-07-13 16:47:30 +01:00
committed by GitHub
parent 9342c8fb82
commit 34d9409427
11 changed files with 484 additions and 67 deletions

View File

@@ -17,7 +17,9 @@
import unittest
from transformers import AutoTokenizer, GPTNeoXConfig, is_torch_available
from parameterized import parameterized
from transformers import AutoTokenizer, GPTNeoXConfig, is_torch_available, set_seed
from transformers.testing_utils import require_torch, slow, torch_device
from ...generation.test_utils import GenerationTesterMixin
@@ -49,7 +51,7 @@ class GPTNeoXModelTester:
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
hidden_size=32,
hidden_size=64,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
@@ -298,6 +300,37 @@ class GPTNeoXModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
def test_feed_forward_chunking(self):
pass
@parameterized.expand([("linear",), ("dynamic",)])
def test_model_rope_scaling(self, scaling_type):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
short_input = ids_tensor([1, 10], config.vocab_size)
long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size)
set_seed(42) # Fixed seed at init time so the two models get the same random weights
original_model = GPTNeoXModel(config)
original_model.to(torch_device)
original_model.eval()
original_short_output = original_model(short_input).last_hidden_state
original_long_output = original_model(long_input).last_hidden_state
set_seed(42) # Fixed seed at init time so the two models get the same random weights
config.rope_scaling = {"type": scaling_type, "factor": 10.0}
scaled_model = GPTNeoXModel(config)
scaled_model.to(torch_device)
scaled_model.eval()
scaled_short_output = scaled_model(short_input).last_hidden_state
scaled_long_output = scaled_model(long_input).last_hidden_state
# Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original
# maximum sequence length, so the outputs for the short input should match.
if scaling_type == "dynamic":
self.assertTrue(torch.allclose(original_short_output, scaled_short_output, atol=1e-5))
else:
self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5))
# The output should be different for long inputs
self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5))
@require_torch
class GPTNeoXLanguageGenerationTest(unittest.TestCase):

View File

@@ -17,7 +17,9 @@
import unittest
from transformers import LlamaConfig, is_torch_available
from parameterized import parameterized
from transformers import LlamaConfig, is_torch_available, set_seed
from transformers.testing_utils import require_torch, torch_device
from ...generation.test_utils import GenerationTesterMixin
@@ -332,3 +334,34 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
@unittest.skip("LLaMA buffers include complex numbers, which breaks this test")
def test_save_load_fast_init_from_base(self):
pass
@parameterized.expand([("linear",), ("dynamic",)])
def test_model_rope_scaling(self, scaling_type):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
short_input = ids_tensor([1, 10], config.vocab_size)
long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size)
set_seed(42) # Fixed seed at init time so the two models get the same random weights
original_model = LlamaModel(config)
original_model.to(torch_device)
original_model.eval()
original_short_output = original_model(short_input).last_hidden_state
original_long_output = original_model(long_input).last_hidden_state
set_seed(42) # Fixed seed at init time so the two models get the same random weights
config.rope_scaling = {"type": scaling_type, "factor": 10.0}
scaled_model = LlamaModel(config)
scaled_model.to(torch_device)
scaled_model.eval()
scaled_short_output = scaled_model(short_input).last_hidden_state
scaled_long_output = scaled_model(long_input).last_hidden_state
# Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original
# maximum sequence length, so the outputs for the short input should match.
if scaling_type == "dynamic":
self.assertTrue(torch.allclose(original_short_output, scaled_short_output, atol=1e-5))
else:
self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5))
# The output should be different for long inputs
self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5))

View File

@@ -17,7 +17,9 @@
import unittest
from transformers import OpenLlamaConfig, is_torch_available
from parameterized import parameterized
from transformers import OpenLlamaConfig, is_torch_available, set_seed
from transformers.testing_utils import require_torch, torch_device
from ...generation.test_utils import GenerationTesterMixin
@@ -335,3 +337,34 @@ class OpenLlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
@unittest.skip("Open-Llama buffers include complex numbers, which breaks this test")
def test_save_load_fast_init_from_base(self):
pass
@parameterized.expand([("linear",), ("dynamic",)])
def test_model_rope_scaling(self, scaling_type):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
short_input = ids_tensor([1, 10], config.vocab_size)
long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size)
set_seed(42) # Fixed seed at init time so the two models get the same random weights
original_model = OpenLlamaModel(config)
original_model.to(torch_device)
original_model.eval()
original_short_output = original_model(short_input).last_hidden_state
original_long_output = original_model(long_input).last_hidden_state
set_seed(42) # Fixed seed at init time so the two models get the same random weights
config.rope_scaling = {"type": scaling_type, "factor": 10.0}
scaled_model = OpenLlamaModel(config)
scaled_model.to(torch_device)
scaled_model.eval()
scaled_short_output = scaled_model(short_input).last_hidden_state
scaled_long_output = scaled_model(long_input).last_hidden_state
# Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original
# maximum sequence length, so the outputs for the short input should match.
if scaling_type == "dynamic":
self.assertTrue(torch.allclose(original_short_output, scaled_short_output, atol=1e-5))
else:
self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5))
# The output should be different for long inputs
self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5))

View File

@@ -240,7 +240,7 @@ class TextGenerationPipelineTests(unittest.TestCase):
# We don't care about infinite range models.
# They already work.
# Skip this test for XGLM, since it uses sinusoidal positional embeddings which are resized on-the-fly.
EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS = ["RwkvForCausalLM", "XGLMForCausalLM"]
EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS = ["RwkvForCausalLM", "XGLMForCausalLM", "GPTNeoXForCausalLM"]
if (
tokenizer.model_max_length < 10000
and text_generator.model.__class__.__name__ not in EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS