From c57eafdaa119eecae8557be4c626629bc1adc0fd Mon Sep 17 00:00:00 2001 From: farrosalferro <127369839+farrosalferro@users.noreply.github.com> Date: Thu, 21 Nov 2024 19:37:34 +0900 Subject: [PATCH] Add Nemotron GGUF Loading Support (#34725) * Add Nemotron GGUF Loading Support * fix the Nemotron architecture assignation --------- Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- docs/source/en/gguf.md | 1 + src/transformers/integrations/ggml.py | 27 ++++++++++++++++++ tests/quantization/ggml/test_ggml.py | 40 +++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md index 2da721b289..b1ed1f0d49 100644 --- a/docs/source/en/gguf.md +++ b/docs/source/en/gguf.md @@ -87,6 +87,7 @@ For now the supported model architectures are the architectures that have been v - Starcoder2 - T5 - Mamba +- Nemotron ## Example usage diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index f4545f2698..57f0af5667 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -248,6 +248,20 @@ GGUF_TENSOR_MAPPING = { "output_norm": "backbone.norm_f", "output.weight": "lm_head.weight", }, + "nemotron": { + "token_embd": "model.embed_tokens", + "blk": "model.layers", + "ffn_up": "mlp.up_proj", + "ffn_down": "mlp.down_proj", + "ffn_norm": "post_attention_layernorm", + "attn_norm": "input_layernorm", + "attn_q": "self_attn.q_proj", + "attn_v": "self_attn.v_proj", + "attn_k": "self_attn.k_proj", + "attn_output": "self_attn.o_proj", + "output.weight": "lm_head.weight", + "output_norm": "model.norm", + }, } @@ -397,6 +411,18 @@ GGUF_CONFIG_MAPPING = { "ssm.time_step_rank": "time_step_rank", "ssm.inner_size": "intermediate_size", }, + "nemotron": { + "context_length": "max_position_embeddings", + "block_count": "num_hidden_layers", + "feed_forward_length": "intermediate_size", + "embedding_length": "hidden_size", + "rope.dimension_count": None, + "rope.freq_base": "rope_theta", + "attention.head_count": "num_attention_heads", + "attention.head_count_kv": "num_key_value_heads", + "attention.layer_norm_rms_epsilon": "norm_eps", + "vocab_size": "vocab_size", + }, } GGUF_TOKENIZER_MAPPING = { @@ -793,6 +819,7 @@ GGUF_TO_FAST_CONVERTERS = { "starcoder2": GGUFGPTConverter, "t5": GGUFT5Converter, "mamba": GGUFGPTConverter, + "nemotron": GGUFGPTConverter, } diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index 84278e7032..42b05f1844 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -61,6 +61,8 @@ class GgufIntegrationTests(unittest.TestCase): starcoder2_original_model_id = "bigcode/starcoder2-3b" mamba_original_model_id = "state-spaces/mamba-2.8b-hf" mamba_model_id = "jpodivin/mamba-2.8b-hf-GGUF" + nemotron_original_model_id = "nvidia/Nemotron-Mini-4B-Instruct" + nemotron_model_id = "bartowski/Nemotron-Mini-4B-Instruct-GGUF" # standard quants q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf" @@ -106,6 +108,8 @@ class GgufIntegrationTests(unittest.TestCase): fp16_starcoder2_gguf_model_id = "starcoder2-3b.fp16.gguf" q6_k_mamba_model_id = "ggml-model-Q6_K.gguf" fp16_mamba_model_id = "ggml-model-f16.gguf" + q6_k_nemotron_model_id = "Nemotron-Mini-4B-Instruct-Q6_K.gguf" + fp16_nemotron_model_id = "Nemotron-Mini-4B-Instruct-f16.gguf" example_text = "Hello" @@ -792,6 +796,42 @@ class GgufIntegrationTests(unittest.TestCase): EXPECTED_TEXT = "Hello,I answerthe question.\n\nA" self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + def test_nemotron_weights_conversion_fp16(self): + original_model = AutoModelForCausalLM.from_pretrained( + self.nemotron_original_model_id, + torch_dtype=torch.float16, + ) + + converted_model = AutoModelForCausalLM.from_pretrained( + self.nemotron_model_id, + gguf_file=self.fp16_nemotron_model_id, + torch_dtype=torch.float16, + ) + + converted_state_dict = converted_model.state_dict() + original_state_dict = original_model.state_dict() + + for layer_name, original_params in original_state_dict.items(): + if layer_name in converted_state_dict: + self.assertTrue(original_params.shape == converted_state_dict[layer_name].shape) + torch.testing.assert_close(original_params, converted_state_dict[layer_name]) + else: + raise ValueError(f"Layer {layer_name} is not presented in GGUF model") + + def test_nemotron_q6_k(self): + model = AutoModelForCausalLM.from_pretrained( + self.nemotron_model_id, + gguf_file=self.q6_k_nemotron_model_id, + torch_dtype=torch.float16, + ) + + tokenizer = AutoTokenizer.from_pretrained(self.nemotron_model_id, gguf_file=self.q6_k_nemotron_model_id) + text = tokenizer(self.example_text, return_tensors="pt")["input_ids"] + out = model.generate(text, max_new_tokens=10) + + EXPECTED_TEXT = "'Hello. hotmail.com.'" + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + def test_tokenization_xnli(self): import tqdm from datasets import load_dataset