Support reading tiktoken tokenizer.model file (#31656)

* use existing TikTokenConverter to read tiktoken tokenizer.model file

* del test file

* create tiktoken integration file

* adding tiktoken llama test

* ALTERNATIVE IMPLEMENTATION: supports llama 405B

* fix one char

* remove redundant line

* small fix

* rm unused import

* flag for converting from tiktoken

* remove unneeded file

* ruff

* remove llamatiktokenconverter, stick to general converter

* tiktoken support v2

* update test

* remove stale changes

* update doc

* protect import

* use is_protobuf_available

* add templateprocessor in tiktokenconverter

* reverting templateprocessor from tiktoken support

* update test

* add require_tiktoken

* dev-ci

* trigger build

* trigger build again

* dev-ci

* [build-ci-image] tiktoken

* dev-ci

* dev-ci

* dev-ci

* dev-ci

* change tiktoken file name

* feedback review

* feedback rev

* applying feedback, removing tiktoken converters

* conform test

* adding docs for review

* add doc file for review

* add doc file for review

* add doc file for review

* support loading model without config.json file

* Revert "support loading model without config.json file"

This reverts commit 2753602e51c34cef2f184eb11f36d2ad1b02babb.

* remove dev var

* updating docs

* safely import protobuf

* fix protobuf import error

* fix protobuf import error

* trying isort to fix ruff error

* fix ruff error

* try to fix ruff again

* try to fix ruff again

* try to fix ruff again

* doc table of contents

* add fix for consistency.dockerfile torchaudio

* ruff

* applying feedback

* minor typo

* merging with push-ci-image

* clean up imports

* revert dockerfile consistency
This commit is contained in:
Ita Zaporozhets
2024-09-06 08:24:02 -04:00
committed by GitHub
parent 342e800086
commit e48e5f1f13
13 changed files with 195 additions and 21 deletions

View File

@@ -25,15 +25,19 @@ from huggingface_hub import hf_hub_download
from transformers import (
SPIECE_UNDERLINE,
AddedToken,
AutoTokenizer,
LlamaTokenizer,
LlamaTokenizerFast,
PreTrainedTokenizerFast,
)
from transformers.convert_slow_tokenizer import convert_slow_tokenizer
from transformers.testing_utils import (
get_tests_dir,
nested_simplify,
require_jinja,
require_read_token,
require_sentencepiece,
require_tiktoken,
require_tokenizers,
require_torch,
slow,
@@ -832,3 +836,66 @@ class CommonSpmIntegrationTests(unittest.TestCase):
self.assertEqual(input_ids, [284, 1, 156])
tokens = self.tokenizer.tokenize("No <s> ▁He")
self.assertEqual(tokens, ["▁No", "<s>", "▁He"]) # spaces are eaten by rstrip / lstrip
@require_tiktoken
@require_read_token
class TikTokenIntegrationTests(unittest.TestCase):
    """
    Integration tests for loading a Llama 3 checkpoint as a fast tokenizer.

    NOTE(review): the checkpoint is presumably shipped as a tiktoken `tokenizer.model` file (hence the
    `require_tiktoken` guard) -- confirm against the Hub repo.
    """

    def test_tiktoken_llama(self):
        """
        Load the checkpoint through several entry points and check that they all encode the same way.

        The entry points covered are:
            - `PreTrainedTokenizerFast.from_pretrained` with explicit BOS/EOS tokens,
            - `AutoTokenizer.from_pretrained` with `legacy=False`,
            - a `save_pretrained` / `AutoTokenizer.from_pretrained` round trip through a temporary directory,
            - `AutoTokenizer.from_pretrained` with `from_slow=True`.
        """
        model_path = "hf-internal-testing/Llama3-Instruct-Internal"
        test_text = "This is a test sentence."
        # Expected ids when encoding `test_text` with `add_bos_token=True` and `add_eos_token=True`;
        # the first and last ids are presumably the BOS and EOS ids.
        test_tokens = [128000, 2028, 374, 264, 1296, 11914, 13, 128001]

        num_reserved_special_tokens = 256
        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>",  # end of turn
            "<|python_tag|>",
        ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)]

        # 1. Direct load through the generic fast tokenizer class: the BOS marker written in the text must
        #    come back as a single token.
        tiktoken_tokenizer = PreTrainedTokenizerFast.from_pretrained(
            model_path,
            additional_special_tokens=special_tokens,
            bos_token="<|begin_of_text|>",
            eos_token="<|end_of_text|>",
        )
        tokens = tiktoken_tokenizer.tokenize("<|begin_of_text|> " + test_text)
        self.assertEqual(tokens[0], "<|begin_of_text|>")

        # 2. Load through `AutoTokenizer`: the result must be a fast tokenizer producing the expected ids.
        tiktoken_tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            legacy=False,
            additional_special_tokens=special_tokens,
            add_bos_token=True,
            add_eos_token=True,
        )
        self.assertIsInstance(tiktoken_tokenizer, PreTrainedTokenizerFast)
        tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)
        self.assertEqual(tokens, test_tokens)

        # 3. Save / reload round trip. The context manager removes the directory even when saving, reloading
        #    or one of the assertions fails, so a failing run does not leave files behind.
        with tempfile.TemporaryDirectory() as tmpdirname:
            tiktoken_tokenizer.save_pretrained(tmpdirname)
            tokenizer_reload = AutoTokenizer.from_pretrained(tmpdirname)
            self.assertIsInstance(tokenizer_reload, PreTrainedTokenizerFast)
            tokens = tokenizer_reload.encode(test_text, add_special_tokens=True)
            self.assertEqual(tokens, test_tokens)

        # 4. Load again with `from_slow=True`; the ids must be unchanged.
        tiktoken_tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            additional_special_tokens=special_tokens,
            from_slow=True,
            add_bos_token=True,
            add_eos_token=True,
        )
        tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)
        self.assertEqual(tokens, test_tokens)