Adding Llama FastTokenizer support. (#22264)

* Adding Llama FastTokenizer support.

- Requires a `tokenizers` version that includes https://github.com/huggingface/tokenizers/pull/1183
- Only supports byte_fallback for Llama; raises an error otherwise (safety net).
- Lots of open questions remain around special tokens

How to test:

```python

# Sanity check: the tokenizer produced by `convert_slow_tokenizer` must encode
# and decode the sample strings the same way as the tokenizer it was built from.
from transformers.convert_slow_tokenizer import convert_slow_tokenizer
from transformers import AutoTokenizer
from tokenizers import Tokenizer

tokenizer = AutoTokenizer.from_pretrained("huggingface/llama-7b")

# Manual toggle: change to `if True:` to reload a previously saved "tok.json"
# instead of running the conversion again.
if False:
    new_tokenizer = Tokenizer.from_file("tok.json")
else:
    new_tokenizer = convert_slow_tokenizer(tokenizer)
    new_tokenizer.save("tok.json")

strings = [
    "This is a test",
    "生活的真谛是",
    "生活的真谛是[MASK]。",
    # XXX: This one is problematic because of special tokens
    # "<s> Something something",
]

for string in strings:
    # Token ids from both tokenizers must match exactly.
    encoded = tokenizer(string)["input_ids"]
    encoded2 = new_tokenizer.encode(string).ids

    assert encoded == encoded2, f"{encoded} != {encoded2}"

    decoded = tokenizer.decode(encoded)
    decoded2 = new_tokenizer.decode(encoded2)

    # Only the reference output is stripped before comparing; presumably it
    # carries surrounding whitespace that the converted tokenizer does not
    # emit -- TODO confirm.
    assert decoded.strip() == decoded2, f"{repr(decoded)} != {repr(decoded2)}"
```

The converter + some test script.

The test script.

Tmp save.

Adding Fast tokenizer + tests.

Adding the tokenization tests.

Correct combination.

Small fix.

Fixing tests.

Fixing with latest update.

Rebased.

fix copies + normalized added tokens  + copies.

Adding doc.

TMP.

Doc + split files.

Doc.

Versions + try import.

Fix Camembert + warnings -> Error.

Fix by ArthurZucker.

Not a decorator.

* Fixing comments.

* Adding more to docstring.

* Doc rewriting.
This commit is contained in:
Nicolas Patry
2023-04-06 09:53:03 +02:00
committed by GitHub
parent 1564189298
commit 1670be4bde
11 changed files with 267 additions and 25 deletions

View File

@@ -1,3 +1,4 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -23,8 +24,10 @@ from transformers import (
SPIECE_UNDERLINE,
AddedToken,
LlamaTokenizer,
LlamaTokenizerFast,
is_torch_available,
)
from transformers.convert_slow_tokenizer import convert_slow_tokenizer
from transformers.testing_utils import (
get_tests_dir,
nested_simplify,
@@ -287,13 +290,11 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@require_sentencepiece
@require_tokenizers
class LlamaIntegrationTest(unittest.TestCase):
checkpoint_name = "hf-internal-testing/llama-tokenizer"
@classmethod
def setUpClass(cls):
cls.tokenizer: LlamaTokenizer = LlamaTokenizer.from_pretrained(cls.checkpoint_name)
cls.rust_tokenizer = cls.tokenizer # TODO @narsil replace with the rust one
cls.pad_token_id = 1
checkpoint_name = "hf-internal-testing/llama-tokenizer"
cls.tokenizer: LlamaTokenizer = LlamaTokenizer.from_pretrained(checkpoint_name)
cls.rust_tokenizer = LlamaTokenizerFast.from_pretrained(checkpoint_name)
return cls
@require_torch
@@ -314,6 +315,27 @@ class LlamaIntegrationTest(unittest.TestCase):
},
)
@slow
def test_conversion(self):
    """Check that converting the slow tokenizer reproduces the fast tokenizer's JSON.

    The fast tokenizer is serialized with `save_pretrained` and its
    `tokenizer.json` is read back. The slow tokenizer is then converted with
    `convert_slow_tokenizer`, saved, and read back. The two serialized
    payloads must be identical.

    Every file is written inside a single temporary directory, so the test
    leaves nothing behind in the current working directory.
    """
    # This is excruciatingly slow since it has to recreate the entire merge
    # list from the original vocabulary in spm
    with tempfile.TemporaryDirectory() as dirname:
        self.rust_tokenizer.save_pretrained(dirname)
        with open(os.path.join(dirname, "tokenizer.json"), "r", encoding="utf-8") as f:
            old_serialized = f.read()

        new_tokenizer = convert_slow_tokenizer(self.tokenizer)
        # Save to a regular path inside the temporary directory rather than
        # re-opening a still-open NamedTemporaryFile by name, which is not
        # portable (it fails on Windows).
        converted_path = os.path.join(dirname, "converted_tokenizer.json")
        new_tokenizer.save(converted_path)
        with open(converted_path, "r", encoding="utf-8") as f:
            new_serialized = f.read()

    self.assertEqual(old_serialized, new_serialized)
def test_simple_encode_decode(self):
pyth_tokenizer = self.tokenizer
rust_tokenizer = self.rust_tokenizer
@@ -362,11 +384,27 @@ class LlamaIntegrationTest(unittest.TestCase):
self.assertEqual(pyth_tokenizer.encode(" Hello"), [1, 29871, 15043])
self.assertEqual(rust_tokenizer.encode(" Hello"), [1, 29871, 15043])
def test_no_differences_showcase(self):
pyth_tokenizer = self.tokenizer
rust_tokenizer = self.rust_tokenizer
self.assertEqual(pyth_tokenizer.encode(""), [1])
self.assertEqual(rust_tokenizer.encode(""), [1])
self.assertEqual(pyth_tokenizer.encode(" "), [1, 259])
self.assertEqual(rust_tokenizer.encode(" "), [1, 259])
self.assertEqual(pyth_tokenizer.encode(" "), [1, 1678])
self.assertEqual(rust_tokenizer.encode(" "), [1, 1678])
self.assertEqual(pyth_tokenizer.encode(" Hello"), [1, 29871, 15043])
self.assertEqual(rust_tokenizer.encode(" Hello"), [1, 29871, 15043])
self.assertEqual(pyth_tokenizer.encode("<s>"), [1, 1])
self.assertEqual(rust_tokenizer.encode("<s>"), [1, 1])
self.assertEqual(pyth_tokenizer.encode(""), [1])
self.assertEqual(rust_tokenizer.encode(""), [1])
def test_no_differences_decode(self):
pyth_tokenizer = self.tokenizer
rust_tokenizer = self.rust_tokenizer
self.assertEqual(pyth_tokenizer.decode([869]), ".")
self.assertEqual(rust_tokenizer.decode([869]), ".")
@@ -374,6 +412,15 @@ class LlamaIntegrationTest(unittest.TestCase):
self.assertEqual(pyth_tokenizer.decode([30112, 869]), "ا .")
self.assertEqual(rust_tokenizer.decode([30112, 869]), "ا .")
# Parity check: the slow (`self.tokenizer`) and fast (`self.rust_tokenizer`)
# tokenizers must return the same ids for the empty string and for "<s>".
def test_no_differences_special_tokens(self):
pyth_tokenizer = self.tokenizer
rust_tokenizer = self.rust_tokenizer
# Empty input: both tokenizers are expected to return the single id 1.
self.assertEqual(pyth_tokenizer.encode(""), [1])
self.assertEqual(rust_tokenizer.encode(""), [1])
# The literal text "<s>" is expected to encode to [1, 1] with both tokenizers
# (presumably an automatically added BOS id followed by "<s>" itself mapped
# to id 1 -- TODO confirm).
self.assertEqual(pyth_tokenizer.encode("<s>"), [1, 1])
self.assertEqual(rust_tokenizer.encode("<s>"), [1, 1])
@unittest.skipIf(
os.getenv("RUN_TOKENIZER_INTEGRATION", "0") == "0",
"RUN_TOKENIZER_INTEGRATION=1 to run tokenizer integration tests",
@@ -392,8 +439,8 @@ class LlamaIntegrationTest(unittest.TestCase):
self.assertEqual(encoded1, encoded2)
decoded1 = pyth_tokenizer.decode(encoded1)
decoded2 = rust_tokenizer.decode(encoded2)
decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True)
decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True)
self.assertEqual(decoded1, decoded2)
@@ -406,7 +453,7 @@ class LlamaIntegrationTest(unittest.TestCase):
self.assertEqual(encoded1, encoded2)
decoded1 = pyth_tokenizer.decode(encoded1)
decoded2 = rust_tokenizer.decode(encoded2)
decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True)
decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True)
self.assertEqual(decoded1, decoded2)

View File

@@ -24,11 +24,10 @@ class ConvertSlowTokenizerTest(unittest.TestCase):
original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)
with warnings.catch_warnings(record=True) as w:
with self.assertRaises(RuntimeError) as cm:
_ = SpmConverter(original_tokenizer_with_bytefallback)
self.assertEqual(len(w), 1)
self.assertIn(
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
" which is not implemented in the fast tokenizers.",
str(w[0].message),
str(cm.exception),
)