Merge branch 'master' into do_lower_case
This commit is contained in:
@@ -21,6 +21,7 @@ import os
|
||||
import json
|
||||
import six
|
||||
import copy
|
||||
import itertools
|
||||
from io import open
|
||||
|
||||
from .file_utils import cached_path, is_tf_available, is_torch_available
|
||||
@@ -646,9 +647,9 @@ class PreTrainedTokenizer(object):
|
||||
tokenized_text += [sub_text]
|
||||
text_list = tokenized_text
|
||||
|
||||
return sum((self._tokenize(token, **kwargs) if token not \
|
||||
return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \
|
||||
in self.added_tokens_encoder and token not in self.all_special_tokens \
|
||||
else [token] for token in tokenized_text), [])
|
||||
else [token] for token in tokenized_text)))
|
||||
|
||||
added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
|
||||
tokenized_text = split_on_tokens(added_tokens, text)
|
||||
@@ -676,10 +677,6 @@ class PreTrainedTokenizer(object):
|
||||
ids = []
|
||||
for token in tokens:
|
||||
ids.append(self._convert_token_to_id_with_added_voc(token))
|
||||
if len(ids) > self.max_len:
|
||||
logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
|
||||
"for this model ({} > {}). Running this sequence through the model will result in "
|
||||
"indexing errors".format(len(ids), self.max_len))
|
||||
return ids
|
||||
|
||||
def _convert_token_to_id_with_added_voc(self, token):
|
||||
@@ -882,6 +879,11 @@ class PreTrainedTokenizer(object):
|
||||
encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length]
|
||||
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
|
||||
|
||||
if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
|
||||
logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
|
||||
"for this model ({} > {}). Running this sequence through the model will result in "
|
||||
"indexing errors".format(len(ids), self.max_len))
|
||||
|
||||
return encoded_inputs
|
||||
|
||||
def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
|
||||
@@ -1060,7 +1062,7 @@ class PreTrainedTokenizer(object):
|
||||
class attributes (cls_token, unk_token...).
|
||||
"""
|
||||
all_toks = self.all_special_tokens
|
||||
all_ids = list(self._convert_token_to_id(t) for t in all_toks)
|
||||
all_ids = self.convert_tokens_to_ids(all_toks)
|
||||
return all_ids
|
||||
|
||||
@staticmethod
|
||||
|
||||
Reference in New Issue
Block a user