From 125a75a121a777d4416ce5530320897df32e7e3d Mon Sep 17 00:00:00 2001
From: Lysandre
Date: Mon, 10 Feb 2020 10:47:42 -0500
Subject: [PATCH] Correctly compute tokens when padding on the left

---
 src/transformers/data/processors/squad.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py
index f2e63e9394..6a4ff590ea 100644
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -147,7 +147,14 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q
         )
 
         if tokenizer.pad_token_id in encoded_dict["input_ids"]:
-            non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
+            if tokenizer.padding_side == "right":
+                non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
+            else:
+                last_padding_id_position = (
+                    len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id)
+                )
+                non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :]
+
         else:
             non_padded_ids = encoded_dict["input_ids"]
 