Re-implemented tokenize() iteratively in PreTrainedTokenizer.
This commit is contained in:
@@ -472,15 +472,45 @@ class PreTrainedTokenizer(object):
|
|||||||
|
|
||||||
Take care of added tokens.
|
Take care of added tokens.
|
||||||
"""
|
"""
|
||||||
|
def split_on_token(tok, text):
    """Split ``text`` around every occurrence of ``tok``, keeping ``tok``.

    Args:
        tok: The delimiter string to isolate. Must be non-empty, because
            ``str.split`` raises ``ValueError`` on an empty separator.
        text: The string to split.

    Returns:
        A list of the whitespace-stripped, non-empty pieces of ``text``
        with ``tok`` inserted at each position where it occurred. Empty
        or whitespace-only text that does not contain ``tok`` yields an
        empty list.
    """
    pieces = text.split(tok)
    last_index = len(pieces) - 1
    result = []
    for index, piece in enumerate(pieces):
        piece = piece.strip()
        if piece:
            result.append(piece)
        # ``tok`` sits between consecutive pieces, so it follows every piece
        # except the last one. Emitting it only here (rather than whenever
        # the first piece is blank) prevents a spurious ``tok`` for blank
        # input that never contained it.
        if index != last_index:
            result.append(tok)
    return result
|
||||||
|
|
||||||
def split_on_tokens(tok_list, text):
    """Tokenize ``text`` while keeping every token of ``tok_list`` intact.

    The text is split iteratively on each token of ``tok_list`` in turn.
    Pieces that are themselves added/special tokens are never split
    again; all remaining pieces are passed to ``self._tokenize``.

    Args:
        tok_list: Tokens that must survive as single items in the output.
        text: The string to tokenize.

    Returns:
        A flat list of tokens.
    """
    if not text:
        return []
    if not tok_list:
        return self._tokenize(text, **kwargs)
    # Whitespace-only text holds none of the protected tokens; splitting it
    # would otherwise report the first token of ``tok_list`` even though it
    # never occurred in the input.
    if not text.strip():
        return []

    # The membership test is loop-invariant, so build the lookup set once.
    protected = set(self.added_tokens_encoder.keys()) | set(self.all_special_tokens)

    pieces = [text]
    for tok in tok_list:
        next_pieces = []
        for piece in pieces:
            if piece in protected:
                # Already a protected token: keep it whole.
                next_pieces.append(piece)
            else:
                next_pieces.extend(split_on_token(tok, piece))
        pieces = next_pieces

    # Flatten with ``extend`` instead of ``sum(lists, [])``, which copies
    # the accumulated list on every addition.
    tokenized_text = []
    for piece in pieces:
        if piece in protected:
            tokenized_text.append(piece)
        else:
            tokenized_text.extend(self._tokenize(piece, **kwargs))
    return tokenized_text
|
||||||
|
|
||||||
added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
|
added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
|
||||||
tokenized_text = split_on_tokens(added_tokens, text)
|
tokenized_text = split_on_tokens(added_tokens, text)
|
||||||
|
|||||||
Reference in New Issue
Block a user