Fix + Test (#8049)
This commit is contained in:
@@ -166,6 +166,9 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
|
||||
tokens = token.split(" ")
|
||||
words = []
|
||||
for token in tokens:
|
||||
if not len(token):
|
||||
continue
|
||||
|
||||
token = token.lower()
|
||||
word = tuple(token)
|
||||
word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
|
||||
|
||||
@@ -75,6 +75,15 @@ class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
assert src_text != decoded # I wish it did!
|
||||
assert decoded == "i am a small frog ."
|
||||
|
||||
def test_empty_word_small_tok(self):
    """Check that a sentence-final "." gets the same id as "." encoded alone.

    Encodes the sentence "I am a small frog ." and the single-character
    string "." with the "facebook/blenderbot-90M" tokenizer, then compares
    the last id of the former with the first id of the latter.
    """
    tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")
    sentence = "I am a small frog ."
    lone_dot = "."

    sentence_ids = tokenizer(sentence)["input_ids"]
    lone_dot_ids = tokenizer(lone_dot)["input_ids"]

    # The trailing "." of the sentence must map to the same id as a bare ".".
    assert sentence_ids[-1] == lone_dot_ids[0]
|
||||
|
||||
|
||||
class Blenderbot3BTokenizerTests(unittest.TestCase):
|
||||
@cached_property
|
||||
|
||||
Reference in New Issue
Block a user