From b3506629551b5496aa7aa80923abade04b70f166 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Mon, 30 Sep 2019 16:37:09 -0400 Subject: [PATCH] overflowing_tokens do not really make sense here, let's just return a number Co-Authored-By: Lysandre Debut --- examples/utils_multiple_choice.py | 2 +- transformers/tokenization_utils.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/utils_multiple_choice.py b/examples/utils_multiple_choice.py index 50bb491243..a7fc1b1222 100644 --- a/examples/utils_multiple_choice.py +++ b/examples/utils_multiple_choice.py @@ -338,7 +338,7 @@ def convert_examples_to_features( max_length=max_length, truncate_both_sequences=True ) - if 'overflowing_tokens' in inputs and len(inputs['overflowing_tokens']) > 0: + if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0: logger.info('Attention! you are cropping tokens (swag task is ok). ' 'If you are training ARC and RACE and you are poping question + options,' 'you need to try to use a bigger max seq length!') diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index ea90f33a28..27b9b0638d 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -837,7 +837,8 @@ class PreTrainedTokenizer(object): copy.deepcopy(pair_ids), max_length=max_length - n_added_tokens ) - encoded_inputs["overflowing_tokens"] = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):] + truncated_tokens = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):] + encoded_inputs["num_truncated_tokens"] = len(truncated_tokens) ids = tokens_a pair_ids = tokens_b elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length: