Tokenizer.batch_decode convenience method (#4159)
This commit is contained in:
@@ -124,9 +124,6 @@ class MarianTokenizer(PreTrainedTokenizer):
|
|||||||
# We don't expect to process pairs, but leave the pair logic for API consistency
|
# We don't expect to process pairs, but leave the pair logic for API consistency
|
||||||
return token_ids_0 + token_ids_1 + [self.eos_token_id]
|
return token_ids_0 + token_ids_1 + [self.eos_token_id]
|
||||||
|
|
||||||
def batch_decode(self, token_ids, **kwargs) -> List[str]:
|
|
||||||
return [self.decode(ids, **kwargs) for ids in token_ids]
|
|
||||||
|
|
||||||
def prepare_translation_batch(
|
def prepare_translation_batch(
|
||||||
self,
|
self,
|
||||||
src_texts: List[str],
|
src_texts: List[str],
|
||||||
|
|||||||
@@ -2183,6 +2183,9 @@ class PreTrainedTokenizer(SpecialTokensMixin):
|
|||||||
else:
|
else:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def batch_decode(self, sequences: List[List[int]], **kwargs) -> List[str]:
|
||||||
|
return [self.decode(seq, **kwargs) for seq in sequences]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def clean_up_tokenization(out_string: str) -> str:
|
def clean_up_tokenization(out_string: str) -> str:
|
||||||
""" Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
|
""" Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
|
||||||
|
|||||||
Reference in New Issue
Block a user