From cbad90d86dadddc040bcf0b369f2da5334fd53bd Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 26 Oct 2020 16:32:27 +0000 Subject: [PATCH] Fix + Test (#8049) --- src/transformers/tokenization_blenderbot.py | 3 +++ tests/test_tokenization_blenderbot.py | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/src/transformers/tokenization_blenderbot.py b/src/transformers/tokenization_blenderbot.py index 6d3dc35666..287a3dcc52 100644 --- a/src/transformers/tokenization_blenderbot.py +++ b/src/transformers/tokenization_blenderbot.py @@ -166,6 +166,9 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer): tokens = token.split(" ") words = [] for token in tokens: + if not len(token): + continue + token = token.lower() word = tuple(token) word = tuple(list(word[:-1]) + [word[-1] + "</w>"]) diff --git a/tests/test_tokenization_blenderbot.py b/tests/test_tokenization_blenderbot.py index e02b4546d1..fee7f19372 100644 --- a/tests/test_tokenization_blenderbot.py +++ b/tests/test_tokenization_blenderbot.py @@ -75,6 +75,15 @@ class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase): assert src_text != decoded # I wish it did! assert decoded == "i am a small frog ." + def test_empty_word_small_tok(self): + tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M") + src_text = "I am a small frog ." + src_text_dot = "." + encoded = tok(src_text)["input_ids"] + encoded_dot = tok(src_text_dot)["input_ids"] + + assert encoded[-1] == encoded_dot[0] + class Blenderbot3BTokenizerTests(unittest.TestCase): @cached_property