[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors

* style

* fix tests - better type checking error messages

* better type checking

* include awesome fix by @LysandreJik for #5310

* updated doc and examples
This commit is contained in:
Thomas Wolf
2020-06-26 19:48:14 +02:00
committed by GitHub
parent fd405e9a93
commit 601d4d699c
73 changed files with 180 additions and 138 deletions

View File

@@ -1583,6 +1583,42 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True`
(to lift the ambiguity with a batch of sequences)
"""
# Input type checking for clearer error
assert isinstance(text, str) or (
isinstance(text, (list, tuple))
and (
len(text) == 0
or (
isinstance(text[0], str)
or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str)))
)
)
), (
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
"or `List[List[str]]` (batch of pretokenized examples)."
)
assert (
text_pair is None
or isinstance(text_pair, str)
or (
isinstance(text_pair, (list, tuple))
and (
len(text_pair) == 0
or (
isinstance(text_pair[0], str)
or (
isinstance(text_pair[0], (list, tuple))
and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))
)
)
)
)
), (
"text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
"or `List[List[str]]` (batch of pretokenized examples)."
)
is_batched = bool(
(not is_pretokenized and isinstance(text, (list, tuple)))
or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)))