Merge branch 'master' into improved_testing
This commit is contained in:
@@ -222,6 +222,9 @@ class PreTrainedTokenizer(object):
|
||||
self._additional_special_tokens = []
|
||||
|
||||
self.max_len = max_len if max_len is not None else int(1e12)
|
||||
self.max_len_single_sentence = self.max_len
|
||||
self.max_len_sentences_pair = self.max_len
|
||||
|
||||
self.added_tokens_encoder = {}
|
||||
self.added_tokens_decoder = {}
|
||||
|
||||
@@ -349,7 +352,7 @@ class PreTrainedTokenizer(object):
|
||||
resolved_vocab_files[file_id] = None
|
||||
else:
|
||||
resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
|
||||
except EnvironmentError:
|
||||
except EnvironmentError as e:
|
||||
if pretrained_model_name_or_path in s3_models:
|
||||
logger.error("Couldn't reach server to download vocabulary.")
|
||||
else:
|
||||
@@ -359,7 +362,7 @@ class PreTrainedTokenizer(object):
|
||||
"at this path or url.".format(
|
||||
pretrained_model_name_or_path, ', '.join(s3_models),
|
||||
pretrained_model_name_or_path, str(vocab_files.keys())))
|
||||
return None
|
||||
raise e
|
||||
|
||||
for file_id, file_path in vocab_files.items():
|
||||
if file_path == resolved_vocab_files[file_id]:
|
||||
@@ -653,10 +656,12 @@ class PreTrainedTokenizer(object):
|
||||
return first_sentence_tokens, second_sentence_tokens
|
||||
|
||||
def add_special_tokens_single_sentence(self, token_ids):
|
||||
raise NotImplementedError
|
||||
logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
|
||||
return token_ids
|
||||
|
||||
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
|
||||
raise NotImplementedError
|
||||
logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
|
||||
return token_ids_0 + token_ids_1
|
||||
|
||||
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
|
||||
""" Converts a single index or a sequence of indices (integers) in a token "
|
||||
@@ -699,9 +704,9 @@ class PreTrainedTokenizer(object):
|
||||
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
|
||||
text = self.convert_tokens_to_string(filtered_tokens)
|
||||
|
||||
if self.sep_token is not None and self.sep_token in text:
|
||||
text = text.replace(self.cls_token, self.sep_token)
|
||||
split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self.sep_token)))
|
||||
if self._sep_token is not None and self._sep_token in text:
|
||||
text = text.replace(self._cls_token, self._sep_token)
|
||||
split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self._sep_token)))
|
||||
if clean_up_tokenization_spaces:
|
||||
clean_text = [self.clean_up_tokenization(text) for text in split_text]
|
||||
return clean_text
|
||||
|
||||
Reference in New Issue
Block a user