GGUF: Fix llama 3 GGUF (#31358)

* Create push-important-models.yml

* llama3 support for GGUF

* fixup

* Update src/transformers/integrations/ggml.py

* fix pre-tokenizer

* fix

* fix

* fix

* fix

* fix

* fix

* address final comment

* handle special tokens + add tests
This commit is contained in:
Younes Belkada
2024-06-20 14:29:58 +02:00
committed by GitHub
parent 35b112d344
commit 6d4306160a
5 changed files with 106 additions and 18 deletions

View File

@@ -21,7 +21,7 @@ with extra methods beings exposed
from array import array from array import array
import numpy as np import numpy as np
from tokenizers import Tokenizer, decoders from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
from tokenizers.models import BPE from tokenizers.models import BPE
from .. import AddedToken from .. import AddedToken
@@ -540,15 +540,26 @@ class GGUFTokenizerSkeleton:
self.merges = merges self.merges = merges
else: else:
self.merges = [tuple(merge.split(" ")) for merge in self.merges] self.merges = [tuple(merge.split(" ")) for merge in self.merges]
if not hasattr(self, "scores"):
self.scores = [None for _ in range(len(self.tokens))]
if not hasattr(self, "added_tokens"): if not hasattr(self, "added_tokens"):
self.added_tokens = [] self.added_tokens = []
if not hasattr(self, "unk_token_id"):
self.unk_token_id = None
# Llama2 uses the field `unknown_token_id`
if hasattr(self, "unknown_token_id") and self.unk_token_id is None:
self.unk_token_id = self.unknown_token_id
class GGUFLlamaConverter(LlamaConverter): class GGUFLlamaConverter(LlamaConverter):
def __init__(self, tokenizer_dict): def __init__(self, tokenizer_dict):
self.proto = GGUFTokenizerSkeleton(tokenizer_dict) self.proto = GGUFTokenizerSkeleton(tokenizer_dict)
self.original_tokenizer = self.proto self.original_tokenizer = self.proto
self.additional_kwargs = {}
self.is_llama_3_tokenizer = getattr(self.proto, "tokenizer_type", "llama") != "llama"
def vocab(self, proto): def vocab(self, proto):
return list(zip(proto.tokens, proto.scores)) return list(zip(proto.tokens, proto.scores))
@@ -560,22 +571,50 @@ class GGUFLlamaConverter(LlamaConverter):
vocab_scores = self.vocab(self.proto) vocab_scores = self.vocab(self.proto)
merges = self.merges(self.proto) merges = self.merges(self.proto)
bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)} bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
tokenizer = Tokenizer(
BPE(bpe_vocab, merges, unk_token=proto.tokens[proto.unk_token_id], fuse_unk=True, byte_fallback=True) unk_token = proto.tokens[proto.unk_token_id] if proto.unk_token_id is not None else None
) bos_token = proto.tokens[proto.bos_token_id] if getattr(proto, "bos_token_id", None) is not None else None
tokenizer.add_special_tokens( eos_token = proto.tokens[proto.bos_token_id] if getattr(proto, "eos_token_id", None) is not None else None
[
AddedToken("<unk>", normalized=False, special=True), tokenizer = Tokenizer(BPE(bpe_vocab, merges, unk_token=unk_token, fuse_unk=True, byte_fallback=True))
AddedToken("<s>", normalized=False, special=True),
AddedToken("</s>", normalized=False, special=True), special_tokens = []
]
) if not hasattr(self.proto, "token_type"):
if unk_token is not None:
special_tokens.append(AddedToken(unk_token, normalized=False, special=True))
if bos_token is not None:
special_tokens.append(AddedToken(bos_token, normalized=False, special=True))
if eos_token is not None:
special_tokens.append(AddedToken(eos_token, normalized=False, special=True))
else:
# 3 stands for special tokens
special_tokens_idx = np.where(np.array(self.proto.token_type) == 3)[0]
for idx in special_tokens_idx:
special_tokens.append(AddedToken(self.proto.tokens[idx], normalized=False, special=True))
if len(special_tokens) != 0:
tokenizer.add_special_tokens(special_tokens)
if len(self.proto.added_tokens) != 0: if len(self.proto.added_tokens) != 0:
tokenizer.add_special_tokens( tokenizer.add_tokens(
[AddedToken(added_token, normalized=False, special=False) for added_token in self.added_tokens] [AddedToken(added_token, normalized=False, special=False) for added_token in self.proto.added_tokens]
) )
self.additional_kwargs["unk_token"] = unk_token
self.additional_kwargs["eos_token"] = bos_token
self.additional_kwargs["bos_token"] = eos_token
if self.is_llama_3_tokenizer:
self.additional_kwargs["add_prefix_space"] = False
self.additional_kwargs["clean_up_tokenization_spaces"] = True
self.additional_kwargs["legacy"] = False
self.original_tokenizer.legacy = False
return tokenizer return tokenizer
def decoder(self, replacement, add_prefix_space): def decoder(self, replacement, add_prefix_space):
@@ -584,14 +623,34 @@ class GGUFLlamaConverter(LlamaConverter):
decoders.Fuse(), decoders.Fuse(),
decoders.Replace("", " "), decoders.Replace("", " "),
] ]
if self.is_llama_3_tokenizer:
sequence += [decoders.ByteLevel(add_prefix_space=False, trim_offsets=False, use_regex=True)]
if add_prefix_space: if add_prefix_space:
sequence += [decoders.Strip(content=" ", left=1)] sequence += [decoders.Strip(content=" ", left=1)]
return decoders.Sequence(sequence) return decoders.Sequence(sequence)
def converted(self):
tokenizer = super().converted()
# HACK: patch the llama-3 tokenizer to use the correspinding pre-tokenizer
# and normalizer
if self.is_llama_3_tokenizer:
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
add_prefix_space=False, trim_offsets=False, use_regex=True
)
# This is tricky as the additional kwargs are passed after legacy is force-set in LlamaTokenizer's
# init.
tokenizer.normalizer = normalizers.Sequence([])
return tokenizer
class GGUFQwen2Converter(Qwen2Converter): class GGUFQwen2Converter(Qwen2Converter):
def __init__(self, tokenizer_dict): def __init__(self, tokenizer_dict):
self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict) self.original_tokenizer = GGUFTokenizerSkeleton(tokenizer_dict)
self.additional_kwargs = {}
def converted(self) -> Tokenizer: def converted(self) -> Tokenizer:
vocab = {word: i for i, word in enumerate(self.original_tokenizer.tokens)} vocab = {word: i for i, word in enumerate(self.original_tokenizer.tokens)}
@@ -629,5 +688,6 @@ def convert_gguf_tokenizer(architecture, tokenizer_dict) -> Tokenizer:
[`~tokenization_utils_base.PreTrainedTokenizerFast`] [`~tokenization_utils_base.PreTrainedTokenizerFast`]
""" """
tokenizer_class_name = architecture tokenizer_class_name = architecture
converter_class = GGUF_TO_FAST_CONVERTERS[tokenizer_class_name] converter = GGUF_TO_FAST_CONVERTERS[tokenizer_class_name](tokenizer_dict)
return converter_class(tokenizer_dict).converted() fast_tokenizer = converter.converted()
return fast_tokenizer, converter.additional_kwargs

View File

@@ -158,7 +158,8 @@ class LlamaTokenizer(PreTrainedTokenizer):
" expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you." " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
" If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it" " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
" means, and thoroughly read the reason why this was added as explained in" " means, and thoroughly read the reason why this was added as explained in"
" https://github.com/huggingface/transformers/pull/24565" " https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file"
" you can ignore this message"
) )
legacy = True legacy = True

View File

@@ -145,7 +145,8 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
" expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you." " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
" If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it" " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
" means, and thoroughly read the reason why this was added as explained in" " means, and thoroughly read the reason why this was added as explained in"
" https://github.com/huggingface/transformers/pull/24565" " https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file"
" you can ignore this message."
) )
legacy = True legacy = True
self.legacy = legacy self.legacy = legacy

View File

@@ -121,7 +121,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
gguf_param = load_gguf_checkpoint(kwargs.get("vocab_file")) gguf_param = load_gguf_checkpoint(kwargs.get("vocab_file"))
architecture = gguf_param["config"]["model_type"] architecture = gguf_param["config"]["model_type"]
tokenizer_dict = gguf_param["tokenizer"] tokenizer_dict = gguf_param["tokenizer"]
fast_tokenizer = convert_gguf_tokenizer(architecture, tokenizer_dict) fast_tokenizer, additional_kwargs = convert_gguf_tokenizer(architecture, tokenizer_dict)
if len(additional_kwargs) > 0:
kwargs.update(additional_kwargs)
elif self.slow_tokenizer_class is not None: elif self.slow_tokenizer_class is not None:
# We need to create and convert a slow tokenizer to build the backend # We need to create and convert a slow tokenizer to build the backend
slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs) slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
@@ -184,6 +188,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
tokens_to_add += [ tokens_to_add += [
token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add
] ]
if len(tokens_to_add) > 0: if len(tokens_to_add) > 0:
# super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
# Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for

View File

@@ -32,6 +32,7 @@ class GgufIntegrationTests(unittest.TestCase):
model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF" mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
qwen2_model_id = "Qwen/Qwen1.5-0.5B-Chat-GGUF" qwen2_model_id = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
llama3_model_id = "NousResearch/Meta-Llama-3-8B-GGUF"
q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf" q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
q4_k_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf" q4_k_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
@@ -43,6 +44,7 @@ class GgufIntegrationTests(unittest.TestCase):
q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf" q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf"
q4_0_qwen2_model_id = "qwen1_5-0_5b-chat-q4_0.gguf" q4_0_qwen2_model_id = "qwen1_5-0_5b-chat-q4_0.gguf"
q4_llama3_model_id = "Meta-Llama-3-8B-Q4_K_M.gguf"
example_text = "Hello" example_text = "Hello"
@@ -171,6 +173,25 @@ class GgufIntegrationTests(unittest.TestCase):
EXPECTED_TEXT = "Hello.jsoup\n\nI am a beginner" EXPECTED_TEXT = "Hello.jsoup\n\nI am a beginner"
self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
def test_llama3_q4_0_tokenizer(self):
tokenizer_gguf = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
special_sentence = "สวัสดี"
predicted_text = tokenizer_gguf.decode(tokenizer_gguf.encode(special_sentence, return_tensors="pt")[0])
self.assertEqual(predicted_text, "<|begin_of_text|>" + special_sentence)
def test_llama3_q4_0(self):
tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id)
model = AutoModelForCausalLM.from_pretrained(
self.llama3_model_id, gguf_file=self.q4_llama3_model_id, device_map="auto", torch_dtype=torch.float16
)
text = tokenizer(self.example_text, return_tensors="pt").to(torch_device)
out = model.generate(**text, max_new_tokens=10)
EXPECTED_TEXT = "Hello, I am new to this forum. I am"
self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT)
def test_tokenization_xnli(self): def test_tokenization_xnli(self):
import tqdm import tqdm
from datasets import load_dataset from datasets import load_dataset