From b0ad06951708b782e45b02a4d092f6fcde68a9b9 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 7 Apr 2020 22:26:31 +0200 Subject: [PATCH] [Tokenization] fix edge case for bert tokenization (#3517) * fix edge case for bert tokenization * add Lysandre's comments for improvement * use new is_pretokenized flag --- src/transformers/tokenization_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 1e6ba253aa..b45cbd2ea0 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -1396,7 +1396,7 @@ class PreTrainedTokenizer(SpecialTokensMixin): input_ids = [] for ids_or_pair_ids in batch_text_or_text_pairs: - if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2: + if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2 and not is_pretokenized: ids, pair_ids = ids_or_pair_ids else: ids, pair_ids = ids_or_pair_ids, None