Reformat to make code clearer in tokenizer call (#11497)
* Reformat to make code clearer
* Reformat to make code clearer
This commit is contained in:
@@ -2236,47 +2236,42 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
|||||||
:obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
:obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||||
"""
|
"""
|
||||||
# Input type checking for clearer error
|
# Input type checking for clearer error
|
||||||
assert isinstance(text, str) or (
|
def _is_valid_text_input(t):
|
||||||
isinstance(text, (list, tuple))
|
if isinstance(t, str):
|
||||||
and (
|
# Strings are fine
|
||||||
len(text) == 0
|
return True
|
||||||
or (
|
elif isinstance(t, (list, tuple)):
|
||||||
isinstance(text[0], str)
|
# Lists are fine as long as they are...
|
||||||
or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str)))
|
if len(t) == 0:
|
||||||
)
|
# ... empty
|
||||||
)
|
return True
|
||||||
), (
|
elif isinstance(t[0], str):
|
||||||
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
|
# ... list of strings
|
||||||
"or `List[List[str]]` (batch of pretokenized examples)."
|
return True
|
||||||
)
|
elif isinstance(t[0], (list, tuple)):
|
||||||
|
# ... list with an empty list or with a list of strings
|
||||||
|
return len(t[0]) == 0 or isinstance(t[0][0], str)
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
assert (
|
if not _is_valid_text_input(text):
|
||||||
text_pair is None
|
raise ValueError(
|
||||||
or isinstance(text_pair, str)
|
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
|
||||||
or (
|
"or `List[List[str]]` (batch of pretokenized examples)."
|
||||||
isinstance(text_pair, (list, tuple))
|
|
||||||
and (
|
|
||||||
len(text_pair) == 0
|
|
||||||
or (
|
|
||||||
isinstance(text_pair[0], str)
|
|
||||||
or (
|
|
||||||
isinstance(text_pair[0], (list, tuple))
|
|
||||||
and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
), (
|
|
||||||
"text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
|
|
||||||
"or `List[List[str]]` (batch of pretokenized examples)."
|
|
||||||
)
|
|
||||||
|
|
||||||
is_batched = bool(
|
if text_pair is not None and not _is_valid_text_input(text_pair):
|
||||||
(not is_split_into_words and isinstance(text, (list, tuple)))
|
raise ValueError(
|
||||||
or (
|
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
|
||||||
is_split_into_words and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
|
"or `List[List[str]]` (batch of pretokenized examples)."
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
if is_split_into_words:
|
||||||
|
is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
|
||||||
|
else:
|
||||||
|
is_batched = isinstance(text, (list, tuple))
|
||||||
|
|
||||||
if is_batched:
|
if is_batched:
|
||||||
if isinstance(text_pair, str):
|
if isinstance(text_pair, str):
|
||||||
|
|||||||
Reference in New Issue
Block a user