Support for Falcon2-11B (#30771)
* remove unrelated changes * remove unrelated changes on phi and stable LM * add: Test for Falcon 10B * fix: formatting * fix: loading the falcon 10B in 8 bit precision using bitsanbytes. * fix: device placement * fix: broken tests. * fix: backwards compatibility for falcon 1B architecture. * chore: updated test. * chore: test_modeling_falcon.py to use the 11B model. * chore: minor edit * chore: formating. --------- Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> Co-authored-by: ArthurZucker <arthur.zucker@gmail.com>
This commit is contained in:
committed by
GitHub
parent
f63d822242
commit
e52741f601
@@ -42,6 +42,9 @@ class FalconConfig(PretrainedConfig):
|
|||||||
Number of hidden layers in the Transformer decoder.
|
Number of hidden layers in the Transformer decoder.
|
||||||
num_attention_heads (`int`, *optional*, defaults to 71):
|
num_attention_heads (`int`, *optional*, defaults to 71):
|
||||||
Number of attention heads for each attention layer in the Transformer encoder.
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
|
num_ln_in_parallel_attn (`int`, *optional*):
|
||||||
|
Set to 2 if separate layer norms are to be used for the MLP and the attention output when using parallel
|
||||||
|
attention, otherwise, 1.
|
||||||
layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
|
layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||||
@@ -115,6 +118,7 @@ class FalconConfig(PretrainedConfig):
|
|||||||
hidden_size=4544,
|
hidden_size=4544,
|
||||||
num_hidden_layers=32,
|
num_hidden_layers=32,
|
||||||
num_attention_heads=71,
|
num_attention_heads=71,
|
||||||
|
num_ln_in_parallel_attn=None,
|
||||||
layer_norm_epsilon=1e-5,
|
layer_norm_epsilon=1e-5,
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
use_cache=True,
|
use_cache=True,
|
||||||
@@ -154,6 +158,7 @@ class FalconConfig(PretrainedConfig):
|
|||||||
self.multi_query = multi_query # Ignored when new_decoder_architecture is True
|
self.multi_query = multi_query # Ignored when new_decoder_architecture is True
|
||||||
self.parallel_attn = parallel_attn
|
self.parallel_attn = parallel_attn
|
||||||
self.bias = bias
|
self.bias = bias
|
||||||
|
self.num_ln_in_parallel_attn = num_ln_in_parallel_attn
|
||||||
self.max_position_embeddings = max_position_embeddings
|
self.max_position_embeddings = max_position_embeddings
|
||||||
self.rope_theta = rope_theta
|
self.rope_theta = rope_theta
|
||||||
self.rope_scaling = rope_scaling
|
self.rope_scaling = rope_scaling
|
||||||
|
|||||||
@@ -767,15 +767,20 @@ class FalconDecoderLayer(nn.Module):
|
|||||||
self.hidden_dropout = config.hidden_dropout
|
self.hidden_dropout = config.hidden_dropout
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
if config.new_decoder_architecture:
|
if config.num_ln_in_parallel_attn is None and config.new_decoder_architecture:
|
||||||
# The layer norm before self-attention
|
config.num_ln_in_parallel_attn = 2
|
||||||
self.ln_attn = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
|
|
||||||
# The layer norm before the MLP
|
if not config.parallel_attn:
|
||||||
self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
|
self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
|
||||||
else:
|
|
||||||
self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
|
self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
|
||||||
if not config.parallel_attn:
|
else:
|
||||||
self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
|
if config.num_ln_in_parallel_attn == 2:
|
||||||
|
# The layer norm before self-attention
|
||||||
|
self.ln_attn = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
|
||||||
|
# The layer norm before the MLP
|
||||||
|
self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
|
||||||
|
else:
|
||||||
|
self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
@@ -796,7 +801,7 @@ class FalconDecoderLayer(nn.Module):
|
|||||||
|
|
||||||
residual = hidden_states
|
residual = hidden_states
|
||||||
|
|
||||||
if self.config.new_decoder_architecture:
|
if self.config.new_decoder_architecture and self.config.num_ln_in_parallel_attn == 2:
|
||||||
attention_layernorm_out = self.ln_attn(hidden_states)
|
attention_layernorm_out = self.ln_attn(hidden_states)
|
||||||
mlp_layernorm_out = self.ln_mlp(hidden_states)
|
mlp_layernorm_out = self.ln_mlp(hidden_states)
|
||||||
else:
|
else:
|
||||||
@@ -826,6 +831,13 @@ class FalconDecoderLayer(nn.Module):
|
|||||||
)
|
)
|
||||||
mlp_layernorm_out = self.post_attention_layernorm(residual)
|
mlp_layernorm_out = self.post_attention_layernorm(residual)
|
||||||
|
|
||||||
|
if (
|
||||||
|
self.config.new_decoder_architecture
|
||||||
|
and self.config.parallel_attn
|
||||||
|
and self.config.num_ln_in_parallel_attn == 1
|
||||||
|
):
|
||||||
|
mlp_layernorm_out = attention_layernorm_out
|
||||||
|
|
||||||
outputs = attn_outputs[1:]
|
outputs = attn_outputs[1:]
|
||||||
|
|
||||||
# MLP.
|
# MLP.
|
||||||
|
|||||||
@@ -602,6 +602,25 @@ class FalconLanguageGenerationTest(unittest.TestCase):
|
|||||||
|
|
||||||
self.assertEqual(output_str, EXPECTED_OUTPUT)
|
self.assertEqual(output_str, EXPECTED_OUTPUT)
|
||||||
|
|
||||||
|
@slow
|
||||||
|
@require_bitsandbytes
|
||||||
|
def test_lm_generate_falcon_11b(self):
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-11B", padding_side="left")
|
||||||
|
model = FalconForCausalLM.from_pretrained(
|
||||||
|
"tiiuae/falcon-11B", device_map={"": torch_device}, load_in_8bit=True
|
||||||
|
)
|
||||||
|
model.eval()
|
||||||
|
inputs = tokenizer(
|
||||||
|
"Two roads diverged in a yellow wood,", return_tensors="pt", return_token_type_ids=False
|
||||||
|
).to(torch_device)
|
||||||
|
|
||||||
|
EXPECTED_OUTPUT = "Two roads diverged in a yellow wood,\nAnd sorry I could not travel both\n"
|
||||||
|
|
||||||
|
output_ids = model.generate(**inputs, do_sample=False, max_new_tokens=9)
|
||||||
|
output_str = tokenizer.batch_decode(output_ids)[0]
|
||||||
|
|
||||||
|
self.assertEqual(output_str, EXPECTED_OUTPUT)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
def test_lm_generation_big_models(self):
|
def test_lm_generation_big_models(self):
|
||||||
# The big models are way too big for the CI, so we use tiny random models that resemble their
|
# The big models are way too big for the CI, so we use tiny random models that resemble their
|
||||||
@@ -647,7 +666,7 @@ class FalconLanguageGenerationTest(unittest.TestCase):
|
|||||||
tokenizer.pad_token = tokenizer.eos_token
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
model = AutoModelForCausalLM.from_pretrained(
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
"tiiuae/falcon-7b",
|
"tiiuae/falcon-7b",
|
||||||
device_map="auto",
|
device_map={"": torch_device},
|
||||||
load_in_4bit=True,
|
load_in_4bit=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user