Fix + Test (#8049)

2020-10-26 16:32:27 +00:00
parent 664c7ec453
commit cbad90d86d
2 changed files with 12 additions and 0 deletions
--- a/src/transformers/tokenization_blenderbot.py
+++ b/src/transformers/tokenization_blenderbot.py
@@ -166,6 +166,9 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
        tokens = token.split(" ")
        words = []
        for token in tokens:
            if not len(token):
                continue
            token = token.lower()
            word = tuple(token)
            word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
--- a/tests/test_tokenization_blenderbot.py
+++ b/tests/test_tokenization_blenderbot.py
@@ -75,6 +75,15 @@ class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
        assert src_text != decoded  # I wish it did!
        assert decoded == "i am a small frog ."
    def test_empty_word_small_tok(self):
        """Check that a trailing "." gets the same id as a standalone ".".

        Encodes a full sentence ending in " ." and the lone string ".", then
        compares the last id of the former with the first id of the latter.
        """
        tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")
        sentence_ids = tokenizer("I am a small frog .")["input_ids"]
        dot_ids = tokenizer(".")["input_ids"]
        # NOTE(review): presumably guards against empty sub-tokens produced
        # while splitting the input — confirm against the tokenizer code.
        assert sentence_ids[-1] == dot_ids[0]
 class Blenderbot3BTokenizerTests(unittest.TestCase):
    @cached_property