wip

2019-11-19 09:49:55 -05:00
parent ea52f82455
commit 72e506b22e
6 changed files with 157 additions and 5 deletions
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -605,6 +605,10 @@ class PreTrainedTokenizer(object):
            vocabularies (BPE/SentencePieces/WordPieces).

            Take care of added tokens.
+
+            text: The sequence to be encoded.
+            return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization (default: False).
+            **kwargs: Passed to the child `self.tokenize()` method.
        """
        def split_on_token(tok, text):
            result = []