Fix + Test (#8049)
This commit is contained in:
@@ -166,6 +166,9 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
|
|||||||
tokens = token.split(" ")
|
tokens = token.split(" ")
|
||||||
words = []
|
words = []
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
|
if not len(token):
|
||||||
|
continue
|
||||||
|
|
||||||
token = token.lower()
|
token = token.lower()
|
||||||
word = tuple(token)
|
word = tuple(token)
|
||||||
word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
|
word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
|
||||||
|
|||||||
@@ -75,6 +75,15 @@ class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
assert src_text != decoded # I wish it did!
|
assert src_text != decoded # I wish it did!
|
||||||
assert decoded == "i am a small frog ."
|
assert decoded == "i am a small frog ."
|
||||||
|
|
||||||
|
def test_empty_word_small_tok(self):
|
||||||
|
tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")
|
||||||
|
src_text = "I am a small frog ."
|
||||||
|
src_text_dot = "."
|
||||||
|
encoded = tok(src_text)["input_ids"]
|
||||||
|
encoded_dot = tok(src_text_dot)["input_ids"]
|
||||||
|
|
||||||
|
assert encoded[-1] == encoded_dot[0]
|
||||||
|
|
||||||
|
|
||||||
class Blenderbot3BTokenizerTests(unittest.TestCase):
|
class Blenderbot3BTokenizerTests(unittest.TestCase):
|
||||||
@cached_property
|
@cached_property
|
||||||
|
|||||||
Reference in New Issue
Block a user