XLMR tokenizer is fully picklable (#13577)

* made tokenizer fully picklable

* remove whitespace

* added test case
This commit is contained in:
Benjamin Davidson
2021-09-16 21:30:05 +01:00
committed by GitHub
parent af5c6ae5ed
commit e02ed0ee7e
2 changed files with 12 additions and 1 deletions

View File

@@ -14,6 +14,9 @@
# limitations under the License.
import os
import pickle
import shutil
import tempfile
import unittest
from transformers import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast
@@ -141,6 +144,13 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
def big_tokenizer(self):
    """Return an ``XLMRobertaTokenizer`` loaded via ``from_pretrained("xlm-roberta-base")``."""
    pretrained_name = "xlm-roberta-base"
    return XLMRobertaTokenizer.from_pretrained(pretrained_name)
def test_picklable_without_disk(self):
    """Pickle a tokenizer built from a temporary vocab copy, then unpickle it.

    The sample vocabulary is copied into a named temporary file, an
    ``XLMRobertaTokenizer`` is constructed from that copy, and the tokenizer
    is round-tripped through ``pickle.dumps`` / ``pickle.loads``. The test
    passes as long as none of these steps raises.
    """
    with tempfile.NamedTemporaryFile() as vocab_copy:
        vocab_path = vocab_copy.name
        shutil.copyfile(SAMPLE_VOCAB, vocab_path)
        slow_tokenizer = XLMRobertaTokenizer(vocab_path, keep_accents=True)
        payload = pickle.dumps(slow_tokenizer)

    # NOTE(review): placed after the `with` block so the temporary file is
    # already closed when unpickling, as the test name suggests — confirm
    # against the upstream file, whose indentation is not visible here.
    pickle.loads(payload)
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return