From 6d4306160ab22c54a677e07f88c7d8808b137d38 Mon Sep 17 00:00:00 2001
From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Date: Thu, 20 Jun 2024 14:29:58 +0200
Subject: [PATCH] GGUF: Fix llama 3 GGUF (#31358)

* Create push-important-models.yml

* llama3 support for GGUF

* fixup

* Update src/transformers/integrations/ggml.py

* fix pre-tokenizer

* fix

* fix

* fix

* fix

* fix

* fix

* address final comment

* handle special tokens + add tests
---
 src/transformers/integrations/ggml.py         | 90 +++++++++++++++----
 .../models/llama/tokenization_llama.py        |  3 +-
 .../models/llama/tokenization_llama_fast.py   |  3 +-
 src/transformers/tokenization_utils_fast.py   |  7 +-
 tests/quantization/ggml/test_ggml.py          | 21 +++++
 5 files changed, 106 insertions(+), 18 deletions(-)
diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index 9aa7e97f78..be953ef08b 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -21,7 +21,7 @@ with extra methods beings exposed
 from array import array
 
 import numpy as np
-from tokenizers import Tokenizer, decoders
+from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
 from tokenizers.models import BPE
 
 from .. import AddedToken
@@ -540,15 +540,26 @@ class GGUFTokenizerSkeleton:
             self.merges = merges
         else:
             self.merges = [tuple(merge.split(" ")) for merge in self.merges]
+            if not hasattr(self, "scores"):
+                self.scores = [None for _ in range(len(self.tokens))]
 
         if not hasattr(self, "added_tokens"):
             self.added_tokens = []
 
+        if not hasattr(self, "unk_token_id"):
+            self.unk_token_id = None
+
+        # Llama2 uses the field `unknown_token_id`
+        if hasattr(self, "unknown_token_id") and self.unk_token_id is None:
+            self.unk_token_id = self.unknown_token_id
+
 
 class GGUFLlamaConverter(LlamaConverter):
     def __init__(self, tokenizer_dict):
         self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
         self.original_tokenizer = self.proto
+        self.additional_kwargs = {}
+        self.is_llama_3_tokenizer = getattr(self.proto, "tokenizer_type", "llama") != "llama"
 
     def vocab(self, proto):
         return list(zip(proto.tokens, proto.scores))
@@ -560,22 +571,50 @@ class GGUFLlamaConverter(LlamaConverter):
         vocab_scores = self.vocab(self.proto)
         merges = self.merges(self.proto)
         bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
-        tokenizer = Tokenizer(
-            BPE(bpe_vocab, merges, unk_token=proto.tokens[proto.unk_token_id], fuse_unk=True, byte_fallback=True)
-        )
-        tokenizer.add_special_tokens(
-            [
-                AddedToken("<unk>", normalized=False, special=True),
-                AddedToken("<s>", normalized=False, special=True),
-                AddedToken("</s>", normalized=False, special=True),
-            ]
-        )
+
+        unk_token = proto.tokens[proto.unk_token_id] if proto.unk_token_id is not None else None
+        bos_token = proto.tokens[proto.bos_token_id] if getattr(proto, "bos_token_id", None) is not None else None
+        eos_token = proto.tokens[proto.bos_token_id] if getattr(proto, "eos_token_id", None) is not None else None
+
+        tokenizer = Tokenizer(BPE(bpe_vocab, merges, unk_token=unk_token, fuse_unk=True, byte_fallback=True))
+
+        special_tokens = []
+
+        if not hasattr(self.proto, "token_type"):
+            if unk_token is not None:
+                special_tokens.append(AddedToken(unk_token, normalized=False, special=True))
+
+            if bos_token is not None:
+                special_tokens.append(AddedToken(bos_token, normalized=False, special=True))
+
+            if eos_token is not None:
+                special_tokens.append(AddedToken(eos_token, normalized=False, special=True))
+        else:
+            # 3 stands for special tokens
+            special_tokens_idx = np.where(np.array(self.proto.token_type) == 3)[0]
+
+            for idx in special_tokens_idx:
+                special_tokens.append(AddedToken(self.proto.tokens[idx], normalized=False, special=True))
+
+        if len(special_tokens) != 0:
+            tokenizer.add_special_tokens(special_tokens)
 
         if len(self.proto.added_tokens) != 0:
-            tokenizer.add_special_tokens(
-                [AddedToken(added_token, normalized=False, special=False) for added_token in self.added_tokens]
+            tokenizer.add_tokens(
+                [AddedToken(added_token, normalized=False, special=False) for added_token in self.proto.added_tokens]
             )
 
+        self.additional_kwargs["unk_token"] = unk_token
+        self.additional_kwargs["eos_token"] = bos_token
+        self.additional_kwargs["bos_token"] = eos_token
+
+        if self.is_llama_3_tokenizer:
+            self.additional_kwargs["add_prefix_space"] = False
+            self.additional_kwargs["clean_up_tokenization_spaces"] = True
+
+            self.additional_kwargs["legacy"] = False
+            self.original_tokenizer.legacy = False
+
         return tokenizer
 
     def decoder(self, replacement, add_prefix_space):
@@ -584,14 +623,34 @@ class GGUFLlamaConverter(LlamaConverter):
             decoders.Fuse(),
             decoders.Replace("▁", " "),
         ]
+
+        if self.is_llama_3_tokenizer:
+            sequence += [decoders.ByteLevel(add_prefix_space=False, trim_offsets=False, use_regex=True)]
+
         if add_prefix_space:
             sequence += [decoders.Strip(content=" ", left=1)]
         return decoders.Sequence(sequence)
 
+    def converted(self):
+        tokenizer = super().converted()
+
+        # HACK: patch the llama-3 tokenizer to use the correspinding pre-tokenizer
+        # and normalizer
+        if self.is_llama_3_tokenizer:
+            tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
+                add_prefix_space=False, trim_offsets=False, use_regex=True
+            )
+            # This is tricky as the additional kwargs are passed after legacy is force-set in LlamaTokenizer's
+            # init.
+            tokenizer.normalizer = normalizers.Sequence([])
+
+        return tokenizer
+
 
 class GGUFQwen2Converter(Qwen2Converter):
     def __init__(self, tokenizer_dict):
         self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict)
+        self.additional_kwargs = {}
 
     def converted(self) -> Tokenizer:
         vocab = {word: i for i, word in enumerate(self.original_tokenizer.tokens)}
@@ -629,5 +688,6 @@ def convert_gguf_tokenizer(architecture, tokenizer_dict) -> Tokenizer:
         [`~tokenization_utils_base.PreTrainedTokenizerFast`]
     """
     tokenizer_class_name = architecture
-    converter_class = GGUF_TO_FAST_CONVERTERS[tokenizer_class_name]
-    return converter_class(tokenizer_dict).converted()
+    converter = GGUF_TO_FAST_CONVERTERS[tokenizer_class_name](tokenizer_dict)
+    fast_tokenizer = converter.converted()
+    return fast_tokenizer, converter.additional_kwargs
diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py
index 5392afb763..80865ba98d 100644
--- a/src/transformers/models/llama/tokenization_llama.py
+++ b/src/transformers/models/llama/tokenization_llama.py
@@ -158,7 +158,8 @@ class LlamaTokenizer(PreTrainedTokenizer):
                 " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
                 " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
                 " means, and thoroughly read the reason why this was added as explained in"
-                " https://github.com/huggingface/transformers/pull/24565"
+                " https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file"
+                " you can ignore this message"
             )
             legacy = True
 
diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py
index 44168fbedc..91d3bf3615 100644
--- a/src/transformers/models/llama/tokenization_llama_fast.py
+++ b/src/transformers/models/llama/tokenization_llama_fast.py
@@ -145,7 +145,8 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
                 " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
                 " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
                 " means, and thoroughly read the reason why this was added as explained in"
-                " https://github.com/huggingface/transformers/pull/24565"
+                " https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file"
+                " you can ignore this message."
             )
             legacy = True
         self.legacy = legacy
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index 597f537cc8..c414b20da2 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -121,7 +121,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             gguf_param = load_gguf_checkpoint(kwargs.get("vocab_file"))
             architecture = gguf_param["config"]["model_type"]
             tokenizer_dict = gguf_param["tokenizer"]
-            fast_tokenizer = convert_gguf_tokenizer(architecture, tokenizer_dict)
+            fast_tokenizer, additional_kwargs = convert_gguf_tokenizer(architecture, tokenizer_dict)
+
+            if len(additional_kwargs) > 0:
+                kwargs.update(additional_kwargs)
+
         elif self.slow_tokenizer_class is not None:
             # We need to create and convert a slow tokenizer to build the backend
             slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
@@ -184,6 +188,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         tokens_to_add += [
             token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add
         ]
+
         if len(tokens_to_add) > 0:
             # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
             # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py
index 679754db38..e5e8dbaf36 100644
--- a/tests/quantization/ggml/test_ggml.py
+++ b/tests/quantization/ggml/test_ggml.py
@@ -32,6 +32,7 @@ class GgufIntegrationTests(unittest.TestCase):
     model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
     mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
     qwen2_model_id = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
+    llama3_model_id = "NousResearch/Meta-Llama-3-8B-GGUF"
 
     q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
     q4_k_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
@@ -43,6 +44,7 @@ class GgufIntegrationTests(unittest.TestCase):
 
     q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf"
     q4_0_qwen2_model_id = "qwen1_5-0_5b-chat-q4_0.gguf"
+    q4_llama3_model_id = "Meta-Llama-3-8B-Q4_K_M.gguf"
 
     example_text = "Hello"
 
@@ -171,6 +173,25 @@ class GgufIntegrationTests(unittest.TestCase):
         EXPECTED_TEXT = "Hello.jsoup\n\nI am a beginner"
         self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
 
+    def test_llama3_q4_0_tokenizer(self):
+        tokenizer_gguf = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
+        special_sentence = "สวัสดี"
+        predicted_text = tokenizer_gguf.decode(tokenizer_gguf.encode(special_sentence, return_tensors="pt")[0])
+        self.assertEqual(predicted_text, "<|begin_of_text|>" + special_sentence)
+
+    def test_llama3_q4_0(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
+        model = AutoModelForCausalLM.from_pretrained(
+            self.llama3_model_id, gguf_file=self.q4_llama3_model_id, device_map="auto", torch_dtype=torch.float16
+        )
+
+        text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
+        out = model.generate(**text, max_new_tokens=10)
+
+        EXPECTED_TEXT = "Hello, I am new to this forum. I am"
+
+        self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
+
     def test_tokenization_xnli(self):
         import tqdm
         from datasets import load_dataset