This commit is contained in:
Lysandre
2019-11-19 09:49:55 -05:00
committed by LysandreJik
parent ea52f82455
commit 72e506b22e
6 changed files with 157 additions and 5 deletions

View File

@@ -605,6 +605,10 @@ class PreTrainedTokenizer(object):
vocabularies (BPE/SentencePieces/WordPieces).
Take care of added tokens.
text: The sequence to be encoded.
return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization (default: False).
**kwargs: passed to the child `self.tokenize()` method
"""
def split_on_token(tok, text):
result = []