Various tokenizers fixes (#5558)

* BertTokenizerFast - Do not specify strip_accents by default

* Bump tokenizers to new version

* Add test for AddedToken serialization
This commit is contained in:
Anthony MOI
2020-07-06 18:27:53 -04:00
committed by GitHub
parent 21f28c34b7
commit 5787e4c159
4 changed files with 42 additions and 25 deletions

View File

@@ -24,6 +24,7 @@ from typing import TYPE_CHECKING, Dict, List, Tuple, Union
from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
from transformers.testing_utils import require_tf, require_torch, slow
from transformers.tokenization_utils import AddedToken
if TYPE_CHECKING:
@@ -233,6 +234,12 @@ class TokenizerTesterMixin:
self.assertListEqual(subwords, subwords_loaded)
def test_pickle_added_tokens(self):
    """Check that an ``AddedToken`` keeps its state across a pickle round-trip."""
    original = AddedToken("<s>", rstrip=True, lstrip=True, normalized=False, single_word=True)

    # Serialize, then rebuild a second token from the serialized payload.
    payload = pickle.dumps(original)
    restored = pickle.loads(payload)

    # Both tokens must report the same state through ``__getstate__``.
    self.assertEqual(original.__getstate__(), restored.__getstate__())
def test_added_tokens_do_lower_case(self):
# TODO(thom) activate fast tokenizer tests once Rust tokenizers accepts white spaces in added tokens
tokenizers = self.get_tokenizers(fast=False, do_lower_case=True)