update encode_plus - add truncation strategies

This commit is contained in:
thomwolf
2019-10-04 17:38:38 -04:00
parent 92c0f2fb90
commit 6c1d0bc066
12 changed files with 222 additions and 193 deletions

View File

@@ -538,15 +538,9 @@ class PreTrainedTokenizer(object):
Returns:
Number of tokens added to sequences
"""
if pair:
initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another"))
final_tokens_len = len(self.encode("This is a sequence", "This is another", add_special_tokens=True))
else:
initial_tokens_len = len(self.encode("This is a sequence"))
final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True))
return final_tokens_len - initial_tokens_len
token_ids_0 = []
token_ids_1 = []
return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
def add_special_tokens(self, special_tokens_dict):
"""
@@ -795,7 +789,7 @@ class PreTrainedTokenizer(object):
return_tensors=return_tensors)
def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
truncate_first_sequence=True, truncate_both_sequences=True, return_tensors=None):
truncation_strategy='longest_first', return_tensors=None):
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
It adds special tokens, truncates
@@ -812,6 +806,12 @@ class PreTrainedTokenizer(object):
to their model.
stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
list of inputs.
truncation_strategy: string selected in the following options:
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
starting from the longest one at each token (when there is a pair of input sequences)
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
alongside a specified `max_length`, will truncate the first sequence if the total size is superior
than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
@@ -840,37 +840,17 @@ class PreTrainedTokenizer(object):
len_pair_ids = len(pair_ids) if pair else 0
encoded_inputs = {}
if max_length:
n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
if n_added_tokens + len_ids + len_pair_ids > max_length:
if truncate_both_sequences:
tokens_a, tokens_b = self._truncate_seq_pair(
copy.deepcopy(ids),
copy.deepcopy(pair_ids),
max_length=max_length - n_added_tokens
)
truncated_tokens = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
encoded_inputs["num_truncated_tokens"] = len(truncated_tokens)
ids = tokens_a
pair_ids = tokens_b
elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
logger.warning(
"You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
"This pair of sequences will not be truncated.")
elif truncate_first_sequence or not pair:
encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
ids = ids[:max_length - len_pair_ids - n_added_tokens]
elif not truncate_first_sequence and pair:
encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
else:
logger.warning(
"Cannot truncate second sequence as it is not provided. No truncation.")
total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0)
if max_length and total_len > max_length:
ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids,
num_tokens_to_remove=total_len-max_length,
truncation_strategy=truncation_strategy)
encoded_inputs["overflowing_tokens"] = overflowing_tokens
encoded_inputs["num_truncated_tokens"] = total_len - max_length
if add_special_tokens:
sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
else:
sequence = ids + pair_ids if pair else ids
@@ -895,39 +875,76 @@ class PreTrainedTokenizer(object):
return encoded_inputs
def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first'):
"""Truncates a sequence pair in place to the maximum length.
truncation_strategy: string selected in the following options:
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
starting from the longest one at each token (when there is a pair of input sequences).
Overflowing tokens only contains overflow from the first sequence.
- 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove.
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
"""
if num_tokens_to_remove <= 0:
return ids, pair_ids, []
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
if truncation_strategy == 'longest_first':
overflowing_tokens = []
for _ in range(num_tokens_to_remove):
if pair_ids is None or len(ids) > len(pair_ids):
overflowing_tokens.append(ids[-1])
ids = ids[:-1]
else:
pair_ids = pair_ids[:-1]
elif truncation_strategy == 'only_first':
assert len(ids) > num_tokens_to_remove
overflowing_tokens = ids[-num_tokens_to_remove:]
ids = ids[:-num_tokens_to_remove]
elif truncation_strategy == 'only_second':
assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
overflowing_tokens = pair_ids[-num_tokens_to_remove:]
pair_ids = pair_ids[:-num_tokens_to_remove]
elif truncation_strategy == 'do_not_truncate':
raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.")
else:
raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']")
return (ids, pair_ids, overflowing_tokens)
# However, since we'd better not to remove tokens of options and questions, you can choose to use a bigger
# length or only pop from context
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
return (tokens_a, tokens_b)
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
logger.warning("This tokenizer does not make use of special tokens.")
if token_ids_1 is None:
return len(token_ids_0) * [0]
return [0] * len(token_ids_0) + [1] * len(token_ids_1)
def add_special_tokens_single_sequence(self, token_ids):
logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
return token_ids
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A RoBERTa sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s>
"""
logger.warning("This tokenizer does not make use of special tokens. Input is returned with no modification.")
if token_ids_1 is None:
return token_ids_0
return token_ids_0 + token_ids_1
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, special_tokens_present=False):
return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
already_has_special_tokens: (default False) Set to True if the token list is already formated with
special tokens for the model
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
"""
return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
""" Converts a single index or a sequence of indices (integers) in a token "