[Tokenization] fix edge case for bert tokenization (#3517)

* fix edge case for bert tokenization

* add Lysandre's comments for improvement

* use new is_pretokenized flag
This commit is contained in:
Patrick von Platen
2020-04-07 22:26:31 +02:00
committed by GitHub
parent 80fa0f7812
commit b0ad069517

View File

@@ -1396,7 +1396,7 @@ class PreTrainedTokenizer(SpecialTokensMixin):
input_ids = []
for ids_or_pair_ids in batch_text_or_text_pairs:
if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2:
if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2 and not is_pretokenized:
ids, pair_ids = ids_or_pair_ids
else:
ids, pair_ids = ids_or_pair_ids, None