From ad1f7bef13f03287af00f819605d696138a5e6ec Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Thu, 29 Apr 2021 07:51:09 -0400
Subject: [PATCH] Reformat to make code clearer in tokenizer call (#11497)

* Reformat to make code clearer

* Reformat to make code clearer
---
 src/transformers/tokenization_utils_base.py | 69 ++++++++++-----------
 1 file changed, 32 insertions(+), 37 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 2d7f7d8518..eed0342566 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2236,47 +2236,42 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
         """
         # Input type checking for clearer error
-        assert isinstance(text, str) or (
-            isinstance(text, (list, tuple))
-            and (
-                len(text) == 0
-                or (
-                    isinstance(text[0], str)
-                    or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str)))
-                )
-            )
-        ), (
-            "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
-            "or `List[List[str]]` (batch of pretokenized examples)."
-        )
+        def _is_valid_text_input(t):
+            if isinstance(t, str):
+                # Strings are fine
+                return True
+            elif isinstance(t, (list, tuple)):
+                # Lists are fine as long as they are...
+                if len(t) == 0:
+                    # ... empty
+                    return True
+                elif isinstance(t[0], str):
+                    # ... list of strings
+                    return True
+                elif isinstance(t[0], (list, tuple)):
+                    # ... list with an empty list or with a list of strings
+                    return len(t[0]) == 0 or isinstance(t[0][0], str)
+                else:
+                    return False
+            else:
+                return False
 
-        assert (
-            text_pair is None
-            or isinstance(text_pair, str)
-            or (
-                isinstance(text_pair, (list, tuple))
-                and (
-                    len(text_pair) == 0
-                    or (
-                        isinstance(text_pair[0], str)
-                        or (
-                            isinstance(text_pair[0], (list, tuple))
-                            and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))
-                        )
-                    )
-                )
+        if not _is_valid_text_input(text):
+            raise ValueError(
+                "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
+                "or `List[List[str]]` (batch of pretokenized examples)."
             )
-        ), (
-            "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
-            "or `List[List[str]]` (batch of pretokenized examples)."
-        )
 
-        is_batched = bool(
-            (not is_split_into_words and isinstance(text, (list, tuple)))
-            or (
-                is_split_into_words and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
+        if text_pair is not None and not _is_valid_text_input(text_pair):
+            raise ValueError(
+                "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
+                "or `List[List[str]]` (batch of pretokenized examples)."
             )
-        )
+
+        if is_split_into_words:
+            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
+        else:
+            is_batched = isinstance(text, (list, tuple))
 
         if is_batched:
             if isinstance(text_pair, str):