[Tokenization] fix edge case for bert tokenization (#3517)
* fix edge case for bert tokenization * add Lysandre's comments for improvement * use new `is_pretokenized` flag
This commit is contained in:
committed by
GitHub
parent
80fa0f7812
commit
b0ad069517
@@ -1396,7 +1396,7 @@ class PreTrainedTokenizer(SpecialTokensMixin):
|
||||
|
||||
input_ids = []
|
||||
for ids_or_pair_ids in batch_text_or_text_pairs:
|
||||
- if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2:
|
||||
+ if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2 and not is_pretokenized:
|
||||
ids, pair_ids = ids_or_pair_ids
|
||||
else:
|
||||
ids, pair_ids = ids_or_pair_ids, None
|
||||
|
||||
Reference in New Issue
Block a user