[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors * style * fix tests - better type checking error messages * better type checking * include awesome fix by @LysandreJik for #5310 * updated doc and examples
2020-06-26 19:48:14 +02:00
parent fd405e9a93
commit 601d4d699c
73 changed files with 180 additions and 138 deletions
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1583,6 +1583,42 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True`
                (to lift the ambiguity with a batch of sequences)
        """
+        # Input type checking for clearer error
+        assert isinstance(text, str) or (
+            isinstance(text, (list, tuple))
+            and (
+                len(text) == 0
+                or (
+                    isinstance(text[0], str)
+                    or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str)))
+                )
+            )
+        ), (
+            "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
+            "or `List[List[str]]` (batch of pretokenized examples)."
+        )
+
+        assert (
+            text_pair is None
+            or isinstance(text_pair, str)
+            or (
+                isinstance(text_pair, (list, tuple))
+                and (
+                    len(text_pair) == 0
+                    or (
+                        isinstance(text_pair[0], str)
+                        or (
+                            isinstance(text_pair[0], (list, tuple))
+                            and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))
+                        )
+                    )
+                )
+            )
+        ), (
+            "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
+            "or `List[List[str]]` (batch of pretokenized examples)."
+        )
+
        is_batched = bool(
            (not is_pretokenized and isinstance(text, (list, tuple)))
            or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)))