From cc360649606f1a0105c9d465a2522a454746894f Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Thu, 14 Oct 2021 10:54:20 +0200
Subject: [PATCH] up (#13988)

---
 .../models/byt5/tokenization_byt5.py |  2 +-
 tests/test_tokenization_byt5.py      | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/byt5/tokenization_byt5.py b/src/transformers/models/byt5/tokenization_byt5.py
index bda3313f8c..4714dbd27e 100644
--- a/src/transformers/models/byt5/tokenization_byt5.py
+++ b/src/transformers/models/byt5/tokenization_byt5.py
@@ -237,7 +237,7 @@ class ByT5Tokenizer(PreTrainedTokenizer):
             else:
                 tok_string = bytes([ord(token)])
             bstring += tok_string
-        string = bstring.decode("utf-8")
+        string = bstring.decode("utf-8", errors="ignore")
         return string
 
     # ByT5Tokenizer has no vocab file
diff --git a/tests/test_tokenization_byt5.py b/tests/test_tokenization_byt5.py
index 003e6bd51f..f241d07d92 100644
--- a/tests/test_tokenization_byt5.py
+++ b/tests/test_tokenization_byt5.py
@@ -290,6 +290,22 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
                     ),
                 )
 
+    def test_decode_single_bytes(self):
+        tokenizer_list = []
+        if self.test_slow_tokenizer:
+            tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
+
+        if self.test_rust_tokenizer:
+            tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
+
+        for tokenizer_class, tokenizer_utils in tokenizer_list:
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tokenizer_utils.save_pretrained(tmp_dir)
+
+                tokenizer = tokenizer_class.from_pretrained(tmp_dir)
+
+                self.assertTrue(tokenizer.decode([255]) == "")
+
     # tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list
     def test_pretrained_model_lists(self):
         pass