Support reading tiktoken tokenizer.model file (#31656)
* use existing TikTokenConverter to read tiktoken tokenizer.model file * del test file * create tiktoken integration file * adding tiktoken llama test * ALTERNATIVE IMPLEMENTATION: supports llama 405B * fix one char * remove redundant line * small fix * rm unused import * flag for converting from tiktoken * remove unneeded file * ruff * remove llamatiktokenconverter, stick to general converter * tiktoken support v2 * update test * remove stale changes * update doc * protect import * use is_protobuf_available * add templateprocessor in tiktokenconverter * reverting templateprocessor from tiktoken support * update test * add require_tiktoken * dev-ci * trigger build * trigger build again * dev-ci * [build-ci-image] tiktoken * dev-ci * dev-ci * dev-ci * dev-ci * change tiktoken file name * feedback review * feedback rev * applying feedback, removing tiktoken converters * conform test * adding docs for review * add doc file for review * add doc file for review * add doc file for review * support loading model without config.json file * Revert "support loading model without config.json file" This reverts commit 2753602e51c34cef2f184eb11f36d2ad1b02babb. * remove dev var * updating docs * safely import protobuf * fix protobuf import error * fix protobuf import error * trying isort to fix ruff error * fix ruff error * try to fix ruff again * try to fix ruff again * try to fix ruff again * doc table of contents * add fix for consistency.dockerfile torchaudio * ruff * applying feedback * minor typo * merging with push-ci-image * clean up imports * revert dockerfile consistency
This commit is contained in:
@@ -25,15 +25,19 @@ from huggingface_hub import hf_hub_download
|
||||
from transformers import (
|
||||
SPIECE_UNDERLINE,
|
||||
AddedToken,
|
||||
AutoTokenizer,
|
||||
LlamaTokenizer,
|
||||
LlamaTokenizerFast,
|
||||
PreTrainedTokenizerFast,
|
||||
)
|
||||
from transformers.convert_slow_tokenizer import convert_slow_tokenizer
|
||||
from transformers.testing_utils import (
|
||||
get_tests_dir,
|
||||
nested_simplify,
|
||||
require_jinja,
|
||||
require_read_token,
|
||||
require_sentencepiece,
|
||||
require_tiktoken,
|
||||
require_tokenizers,
|
||||
require_torch,
|
||||
slow,
|
||||
@@ -832,3 +836,66 @@ class CommonSpmIntegrationTests(unittest.TestCase):
|
||||
self.assertEqual(input_ids, [284, 1, 156])
|
||||
tokens = self.tokenizer.tokenize("No <s> ▁He")
|
||||
self.assertEqual(tokens, ["▁No", "<s>", "▁He"]) # spaces are eaten by rstrip / lstrip
|
||||
|
||||
|
||||
@require_tiktoken
@require_read_token
class TikTokenIntegrationTests(unittest.TestCase):
    """
    Integration test for loading the tokenizer of the `hf-internal-testing/Llama3-Instruct-Internal` checkpoint
    (presumably shipped as a tiktoken-format tokenizer file -- confirm against the Hub repo).

    The test checks that the checkpoint can be loaded through `PreTrainedTokenizerFast` and `AutoTokenizer`, that
    the encoded ids match a fixed reference, and that they stay the same after a save/reload round trip and when
    loading with `from_slow=True`.
    """

    def test_tiktoken_llama(self):
        model_path = "hf-internal-testing/Llama3-Instruct-Internal"
        test_text = "This is a test sentence."
        # Expected ids for `test_text`; the first and last ids are the BOS / EOS ids added by the tokenizer.
        test_tokens = [128000, 2028, 374, 264, 1296, 11914, 13, 128001]
        num_reserved_special_tokens = 256
        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>",  # end of turn
            "<|python_tag|>",
        ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)]

        # 1. Direct loading through the fast tokenizer class: the BOS string must stay a single token.
        tiktoken_tokenizer = PreTrainedTokenizerFast.from_pretrained(
            model_path,
            additional_special_tokens=special_tokens,
            bos_token="<|begin_of_text|>",
            eos_token="<|end_of_text|>",
        )
        tokens = tiktoken_tokenizer.tokenize("<|begin_of_text|> " + test_text)
        self.assertEqual(tokens[0], "<|begin_of_text|>")

        # 2. Loading through `AutoTokenizer` must resolve to a fast tokenizer producing the reference ids.
        tiktoken_tokenizer = AutoTokenizer.from_pretrained(
            model_path, legacy=False, additional_special_tokens=special_tokens, add_bos_token=True, add_eos_token=True
        )
        self.assertIsInstance(tiktoken_tokenizer, PreTrainedTokenizerFast)

        tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)
        self.assertEqual(tokens, test_tokens)

        # 3. Save / reload round trip. The context manager removes the directory even when a call or an
        #    assertion below fails, so a failing test no longer leaves a stray temporary directory behind.
        with tempfile.TemporaryDirectory() as tmpdirname:
            tiktoken_tokenizer.save_pretrained(tmpdirname)
            tokenizer_reload = AutoTokenizer.from_pretrained(tmpdirname)

            self.assertIsInstance(tokenizer_reload, PreTrainedTokenizerFast)
            tokens = tokenizer_reload.encode(test_text, add_special_tokens=True)
            self.assertEqual(tokens, test_tokens)

        # 4. Loading with `from_slow=True` must yield the same ids.
        tiktoken_tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            additional_special_tokens=special_tokens,
            from_slow=True,
            add_bos_token=True,
            add_eos_token=True,
        )
        tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)
        self.assertEqual(tokens, test_tokens)
|
||||
|
||||
Reference in New Issue
Block a user