overflowing_tokens does not really make sense here; let's just return the number of truncated tokens
Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
This commit is contained in:
@@ -338,7 +338,7 @@ def convert_examples_to_features(
|
|||||||
max_length=max_length,
|
max_length=max_length,
|
||||||
truncate_both_sequences=True
|
truncate_both_sequences=True
|
||||||
)
|
)
|
||||||
if 'overflowing_tokens' in inputs and len(inputs['overflowing_tokens']) > 0:
|
if 'num_truncated_tokens' in inputs and inputs['num_truncated_tokens'] > 0:
|
||||||
logger.info('Attention! you are cropping tokens (swag task is ok). '
|
logger.info('Attention! you are cropping tokens (swag task is ok). '
|
||||||
'If you are training ARC and RACE and you are poping question + options,'
|
'If you are training ARC and RACE and you are poping question + options,'
|
||||||
'you need to try to use a bigger max seq length!')
|
'you need to try to use a bigger max seq length!')
|
||||||
|
|||||||
@@ -837,7 +837,8 @@ class PreTrainedTokenizer(object):
|
|||||||
copy.deepcopy(pair_ids),
|
copy.deepcopy(pair_ids),
|
||||||
max_length=max_length - n_added_tokens
|
max_length=max_length - n_added_tokens
|
||||||
)
|
)
|
||||||
encoded_inputs["overflowing_tokens"] = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
|
truncated_tokens = ids[- (len_ids - len(tokens_a)):] + pair_ids[- (len_pair_ids - len(tokens_b)):]
|
||||||
|
encoded_inputs["num_truncated_tokens"] = len(truncated_tokens)
|
||||||
ids = tokens_a
|
ids = tokens_a
|
||||||
pair_ids = tokens_b
|
pair_ids = tokens_b
|
||||||
elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
|
elif pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
|
||||||
|
|||||||
Reference in New Issue
Block a user