[SpeechT5Tokenization] Add `# Copied from` markers and fix `convert_tokens_to_string` to match the fast decoding scheme (#28522)
* Add `# Copied from` markers and fix `convert_tokens_to_string` to match the fast decoding scheme
* fixup
* add a small test
* style test file
* nits
This commit is contained in:
@@ -251,6 +251,7 @@ class BarthezTokenizer(PreTrainedTokenizer):
|
|||||||
"""Converts an index (integer) in a token (str) using the vocab."""
|
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||||
return self.sp_model.IdToPiece(index)
|
return self.sp_model.IdToPiece(index)
|
||||||
|
|
||||||
|
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
"""Converts a sequence of tokens (string) in a single string."""
|
"""Converts a sequence of tokens (string) in a single string."""
|
||||||
current_sub_tokens = []
|
current_sub_tokens = []
|
||||||
|
|||||||
@@ -181,6 +181,7 @@ class BigBirdTokenizer(PreTrainedTokenizer):
|
|||||||
token = self.sp_model.IdToPiece(index)
|
token = self.sp_model.IdToPiece(index)
|
||||||
return token
|
return token
|
||||||
|
|
||||||
|
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
"""Converts a sequence of tokens (string) in a single string."""
|
"""Converts a sequence of tokens (string) in a single string."""
|
||||||
current_sub_tokens = []
|
current_sub_tokens = []
|
||||||
|
|||||||
@@ -210,6 +210,7 @@ class FNetTokenizer(PreTrainedTokenizer):
|
|||||||
"""Converts an index (integer) in a token (str) using the vocab."""
|
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||||
return self.sp_model.IdToPiece(index)
|
return self.sp_model.IdToPiece(index)
|
||||||
|
|
||||||
|
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
"""Converts a sequence of tokens (string) in a single string."""
|
"""Converts a sequence of tokens (string) in a single string."""
|
||||||
current_sub_tokens = []
|
current_sub_tokens = []
|
||||||
|
|||||||
@@ -230,6 +230,7 @@ class MBart50Tokenizer(PreTrainedTokenizer):
|
|||||||
return self.fairseq_ids_to_tokens[index]
|
return self.fairseq_ids_to_tokens[index]
|
||||||
return self.sp_model.IdToPiece(index - self.fairseq_offset)
|
return self.sp_model.IdToPiece(index - self.fairseq_offset)
|
||||||
|
|
||||||
|
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
"""Converts a sequence of tokens (string) in a single string."""
|
"""Converts a sequence of tokens (string) in a single string."""
|
||||||
current_sub_tokens = []
|
current_sub_tokens = []
|
||||||
|
|||||||
@@ -177,17 +177,23 @@ class SpeechT5Tokenizer(PreTrainedTokenizer):
|
|||||||
token = self.sp_model.IdToPiece(index)
|
token = self.sp_model.IdToPiece(index)
|
||||||
return token
|
return token
|
||||||
|
|
||||||
|
# Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
|
||||||
def convert_tokens_to_string(self, tokens):
|
def convert_tokens_to_string(self, tokens):
|
||||||
"""Converts a sequence of tokens (string) in a single string."""
|
"""Converts a sequence of tokens (string) in a single string."""
|
||||||
current_sub_tokens = []
|
current_sub_tokens = []
|
||||||
out_string = ""
|
out_string = ""
|
||||||
|
prev_is_special = False
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
# make sure that special tokens are not decoded using sentencepiece model
|
# make sure that special tokens are not decoded using sentencepiece model
|
||||||
if token in self.all_special_tokens:
|
if token in self.all_special_tokens:
|
||||||
|
if not prev_is_special:
|
||||||
|
out_string += " "
|
||||||
out_string += self.sp_model.decode(current_sub_tokens) + token
|
out_string += self.sp_model.decode(current_sub_tokens) + token
|
||||||
|
prev_is_special = True
|
||||||
current_sub_tokens = []
|
current_sub_tokens = []
|
||||||
else:
|
else:
|
||||||
current_sub_tokens.append(token)
|
current_sub_tokens.append(token)
|
||||||
|
prev_is_special = False
|
||||||
out_string += self.sp_model.decode(current_sub_tokens)
|
out_string += self.sp_model.decode(current_sub_tokens)
|
||||||
return out_string.strip()
|
return out_string.strip()
|
||||||
|
|
||||||
|
|||||||
@@ -202,3 +202,17 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
revision="c5ef64c71905caeccde0e4462ef3f9077224c524",
|
revision="c5ef64c71905caeccde0e4462ef3f9077224c524",
|
||||||
sequences=sequences,
|
sequences=sequences,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_encode_decode(self):
|
||||||
|
tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_tts")
|
||||||
|
|
||||||
|
tokens = tokenizer.tokenize("a = b")
|
||||||
|
self.assertEqual(tokens, ["▁", "a", "▁", "=", "▁", "b"])
|
||||||
|
|
||||||
|
# the `'='` is unknown.
|
||||||
|
ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||||
|
self.assertEqual(ids, [4, 7, 4, 3, 4, 25])
|
||||||
|
|
||||||
|
# let's make sure decoding with the special unknown tokens preserves spaces
|
||||||
|
ids = tokenizer.encode("a = b")
|
||||||
|
self.assertEqual(tokenizer.decode(ids), "a <unk> b</s>")
|
||||||
|
|||||||
Reference in New Issue
Block a user