Tokenizer.batch_decode convenience method (#4159)
This commit is contained in:
@@ -124,9 +124,6 @@ class MarianTokenizer(PreTrainedTokenizer):
|
||||
# We don't expect to process pairs, but leave the pair logic for API consistency
|
||||
return token_ids_0 + token_ids_1 + [self.eos_token_id]
|
||||
|
||||
def batch_decode(self, token_ids, **kwargs) -> List[str]:
    """Decode a batch of id sequences by calling ``self.decode`` on each one.

    Args:
        token_ids: Iterable whose items are each passed to ``self.decode``.
        **kwargs: Forwarded unchanged to every ``self.decode`` call.

    Returns:
        A list holding one ``self.decode`` result per item, in input order.
    """
    decoded = []
    for ids in token_ids:
        decoded.append(self.decode(ids, **kwargs))
    return decoded
|
||||
|
||||
def prepare_translation_batch(
|
||||
self,
|
||||
src_texts: List[str],
|
||||
|
||||
@@ -2183,6 +2183,9 @@ class PreTrainedTokenizer(SpecialTokensMixin):
|
||||
else:
|
||||
return text
|
||||
|
||||
def batch_decode(self, sequences: List[List[int]], **kwargs) -> List[str]:
    """Convenience wrapper that runs ``self.decode`` over several sequences.

    Args:
        sequences: Batch of token-id lists; each one is handed to
            ``self.decode`` separately.
        **kwargs: Passed through as-is to each ``self.decode`` call.

    Returns:
        The ``self.decode`` results collected in a list, ordered like
        ``sequences``.
    """
    texts: List[str] = []
    texts.extend(self.decode(seq, **kwargs) for seq in sequences)
    return texts
|
||||
|
||||
@staticmethod
|
||||
def clean_up_tokenization(out_string: str) -> str:
|
||||
""" Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
|
||||
|
||||
Reference in New Issue
Block a user