Allow usage of TF Text BertTokenizer on TFBertTokenizer to make it servable on TF Serving (#19590)

* add suport for non fast tf bert tokenizer

* add tests for non fast tf bert tokenizer

* fix fast bert tf tokenizer flag

* double tokenizers list on tf tokenizers test to aovid breaking zip on test output equivalence

* reformat code with black to comply with code quality checks

* trigger ci
This commit is contained in:
Pi Esposito
2022-10-14 11:18:02 -03:00
committed by GitHub
parent 59b7334c87
commit 0e0b7cb72a
2 changed files with 31 additions and 6 deletions

View File

@@ -40,8 +40,15 @@ class BertTokenizationTest(unittest.TestCase):
def setUp(self):
super().setUp()
self.tokenizers = [BertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
self.tf_tokenizers = [TFBertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
self.tokenizers = [
BertTokenizer.from_pretrained(checkpoint) for checkpoint in (TOKENIZER_CHECKPOINTS * 2)
] # repeat for when fast_bert_tokenizer=false
self.tf_tokenizers = [TFBertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS] + [
TFBertTokenizer.from_pretrained(checkpoint, use_fast_bert_tokenizer=False)
for checkpoint in TOKENIZER_CHECKPOINTS
]
assert len(self.tokenizers) == len(self.tf_tokenizers)
self.test_sentences = [
"This is a straightforward English test sentence.",
"This one has some weird characters\rto\nsee\r\nif those\u00E9break things.",