Fix: Nemotron tokenizer for GGUF format (#35836)

Fix the Nemotron tokenizer for GGUF.
This commit is contained in:
Mohamed Mekkouri
2025-01-22 12:28:40 +01:00
committed by GitHub
parent ec28957f94
commit a7738f5a89
2 changed files with 3 additions and 2 deletions

View File

@@ -339,6 +339,7 @@ else:
("musicgen_melody", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), ("musicgen_melody", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)),
("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)), ("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)),
("myt5", ("MyT5Tokenizer", None)), ("myt5", ("MyT5Tokenizer", None)),
("nemotron", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
( (
"nllb", "nllb",

View File

@@ -835,9 +835,9 @@ class GgufIntegrationTests(unittest.TestCase):
tokenizer = AutoTokenizer.from_pretrained(self.nemotron_model_id, gguf_file=self.q6_k_nemotron_model_id) tokenizer = AutoTokenizer.from_pretrained(self.nemotron_model_id, gguf_file=self.q6_k_nemotron_model_id)
text = tokenizer(self.example_text, return_tensors="pt")["input_ids"] text = tokenizer(self.example_text, return_tensors="pt")["input_ids"]
out = model.generate(text, max_new_tokens=10) out = model.generate(text, max_new_tokens=16)
EXPECTED_TEXT = "'Hello. hotmail.com.'" EXPECTED_TEXT = "Hello.hotmail.com</s>"
self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
def test_gemma2_q3_k(self): def test_gemma2_q3_k(self):