[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)
* remove references to old API in docstring - update data processors
* style
* fix tests - better type checking error messages
* better type checking
* include awesome fix by @LysandreJik for #5310
* updated doc and examples
This commit is contained in:
@@ -1583,6 +1583,42 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True`
|
||||
(to lift the ambiguity with a batch of sequences)
|
||||
"""
|
||||
# Input type checking for clearer error
|
||||
assert isinstance(text, str) or (
|
||||
isinstance(text, (list, tuple))
|
||||
and (
|
||||
len(text) == 0
|
||||
or (
|
||||
isinstance(text[0], str)
|
||||
or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str)))
|
||||
)
|
||||
)
|
||||
), (
|
||||
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
|
||||
"or `List[List[str]]` (batch of pretokenized examples)."
|
||||
)
|
||||
|
||||
assert (
|
||||
text_pair is None
|
||||
or isinstance(text_pair, str)
|
||||
or (
|
||||
isinstance(text_pair, (list, tuple))
|
||||
and (
|
||||
len(text_pair) == 0
|
||||
or (
|
||||
isinstance(text_pair[0], str)
|
||||
or (
|
||||
isinstance(text_pair[0], (list, tuple))
|
||||
and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
), (
|
||||
"text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
|
||||
"or `List[List[str]]` (batch of pretokenized examples)."
|
||||
)
|
||||
|
||||
is_batched = bool(
|
||||
(not is_pretokenized and isinstance(text, (list, tuple)))
|
||||
or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)))
|
||||
|
||||
Reference in New Issue
Block a user