overflowing_tokens does not really make sense here; let's just return the number of truncated tokens instead

Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
This commit is contained in:
Julien Chaumond
2019-09-30 16:37:09 -04:00
parent f5bcde0b2f
commit b350662955
2 changed files with 3 additions and 2 deletions

View File

@@ -837,7 +837,8 @@ class PreTrainedTokenizer(object):
copy.deepcopy(pair_ids),
max_length=max_length - n_added_tokens
)
encoded_inputs["overflowing_tokens"] = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
truncated_tokens = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
encoded_inputs["num_truncated_tokens"] = len(truncated_tokens)
ids = tokens_a
pair_ids = tokens_b
elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length: