Big Bird Fast Tokenizer implementation (#11075)

* Added Big Bird Fast Tokenizer initial file

* style fixes

* flake fixes

* Added big bird fast tokenizer to init files

* Added big bird fast to Auto tokenization

* fix styles

* minor quality fixes

* Added initial test code

* Fix SpmConverter when precompiled_charsmap doesn't exist

* fixed post processor

* minor style fix

* minor fix input names

* Actually fix identity normalization

* style

* Added token type ids to fast tokenizer

* style

* flake fix

* fix copies

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
This commit is contained in:
Tanmay Laud
2021-05-10 00:01:23 -07:00
committed by GitHub
parent 80da304a0f
commit f7f872955d
9 changed files with 325 additions and 10 deletions

View File

@@ -17,9 +17,9 @@
import os
import unittest
from transformers import BigBirdTokenizer
from transformers import BigBirdTokenizer, BigBirdTokenizerFast
from transformers.file_utils import cached_property
from transformers.testing_utils import require_sentencepiece, require_torch, slow
from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow
from .test_tokenization_common import TokenizerTesterMixin
@@ -30,9 +30,12 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
@require_sentencepiece
@require_tokenizers
class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BigBirdTokenizer
rust_tokenizer_class = BigBirdTokenizerFast
test_rust_tokenizer = True
def setUp(self):
super().setUp()
@@ -40,6 +43,28 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer = BigBirdTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "I was born in 92000, and this is falsé."
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
def test_full_tokenizer(self):
tokenizer = BigBirdTokenizer(SAMPLE_VOCAB, keep_accents=True)