[Core tokenization] add_dummy_prefix_space option to help with latest issues (#28010)

* add add_dummy_prefix_space option to slow

* checking kwargs might be better. Should be there for all spm tokenizer IMO

* nits

* fix copies

* more copied

* nits

* add prefix space

* nit

* nits

* Update src/transformers/convert_slow_tokenizer.py

* fix init

* revert wrong styling

* fix

* nits

* style

* updates

* make sure we use slow tokenizer for conversion instead of looking for the decoder

* support llama as well

* update llama tokenizer fast

* nits

* nits nits nits

* update the doc

* update

* update to fix tests

* skip unrelated failing test

* Update src/transformers/convert_slow_tokenizer.py

* add proper testing

* test decode as well

* more testing

* format

* fix llama test

* Apply suggestions from code review
This commit is contained in:
Arthur
2024-02-20 12:50:31 +01:00
committed by GitHub
parent efdd436663
commit 15cfe38942
10 changed files with 136 additions and 25 deletions

View File

@@ -306,6 +306,34 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def test_subword_regularization_tokenizer(self):
pass
def test_add_prefix_space(self):
    """Check slow/fast tokenizer agreement with `add_prefix_space` off and on.

    For each setting, both tokenizers are loaded with `legacy=False`, and the
    test asserts:

    * the ids produced by the slow tokenizer match the expected ids,
    * the fast tokenizer encodes the text to the same ids as the slow one,
    * the slow tokenizer splits the text into the expected tokens,
    * decoding the expected ids (special tokens skipped) returns the input text,
    * slow and fast tokenizers decode the expected ids to the same string.
    """
    checkpoint = "hf-internal-testing/llama-tokenizer-non-normalized"
    text = "Hey how are you doing"

    # Each row: (add_prefix_space, expected ids, expected slow tokens).
    # The `False` row comes first, matching the order the checks have always run in.
    scenarios = (
        (
            False,
            [1, 29950, 1032, 920, 526, 366, 2599],
            ["H", "ey", "▁how", "▁are", "▁you", "▁doing"],
        ),
        (
            True,
            [1, 18637, 920, 526, 366, 2599],
            ["▁Hey", "▁how", "▁are", "▁you", "▁doing"],
        ),
    )

    for prefix_space, expected_ids, expected_tokens in scenarios:
        load_kwargs = {"add_prefix_space": prefix_space, "legacy": False}
        slow_tok = self.tokenizer_class.from_pretrained(checkpoint, **load_kwargs)
        fast_tok = self.rust_tokenizer_class.from_pretrained(checkpoint, **load_kwargs)

        # Encoding: slow output is pinned to the expected ids, fast must agree with slow.
        self.assertEqual(slow_tok.encode(text), expected_ids)
        self.assertEqual(slow_tok.encode(text), fast_tok.encode(text))
        self.assertEqual(slow_tok.tokenize(text), expected_tokens)

        # Decoding: the expected ids must round-trip to the input text, and
        # slow and fast must produce the same decoded string.
        self.assertEqual(slow_tok.decode(expected_ids, skip_special_tokens=True), text)
        self.assertEqual(
            slow_tok.decode(expected_ids, skip_special_tokens=True),
            fast_tok.decode(expected_ids, skip_special_tokens=True),
        )
@require_torch
@require_sentencepiece