Various tokenizers fixes (#5558)
* BertTokenizerFast - Do not specify strip_accents by default
* Bump tokenizers to new version
* Add test for AddedToken serialization
This commit is contained in:
@@ -24,6 +24,7 @@ from typing import TYPE_CHECKING, Dict, List, Tuple, Union
|
||||
|
||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast
|
||||
from transformers.testing_utils import require_tf, require_torch, slow
|
||||
from transformers.tokenization_utils import AddedToken
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -233,6 +234,12 @@ class TokenizerTesterMixin:
|
||||
|
||||
self.assertListEqual(subwords, subwords_loaded)
|
||||
|
||||
def test_pickle_added_tokens(self):
|
||||
tok1 = AddedToken("<s>", rstrip=True, lstrip=True, normalized=False, single_word=True)
|
||||
tok2 = pickle.loads(pickle.dumps(tok1))
|
||||
|
||||
self.assertEqual(tok1.__getstate__(), tok2.__getstate__())
|
||||
|
||||
def test_added_tokens_do_lower_case(self):
|
||||
# TODO(thom) activate fast tokenizer tests once Rust tokenizers accepts white spaces in added tokens
|
||||
tokenizers = self.get_tokenizers(fast=False, do_lower_case=True)
|
||||
|
||||
Reference in New Issue
Block a user