[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors * style * fix tests - better type checking error messages * better type checking * include awesome fix by @LysandreJik for #5310 * updated doc and examples
2020-06-26 19:48:14 +02:00
parent fd405e9a93
commit 601d4d699c
73 changed files with 180 additions and 138 deletions
--- a/notebooks/02-transformers.ipynb
+++ b/notebooks/02-transformers.ipynb
@@ -255,7 +255,7 @@
    "# tokens_pt = torch.tensor([tokens_ids])\n",
    "\n",
    "# This code can be factored into one-line as follow\n",
-    "tokens_pt2 = tokenizer.encode_plus(\"This is an input example\", return_tensors=\"pt\")\n",
+    "tokens_pt2 = tokenizer(\"This is an input example\", return_tensors=\"pt\")\n",
    "\n",
    "for key, value in tokens_pt2.items():\n",
    "    print(\"{}:\\n\\t{}\".format(key, value))\n",
@@ -268,7 +268,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "As you can see above, the method `encode_plus` provides a convenient way to generate all the required parameters\n",
+    "As you can see above, calling the tokenizer provides a convenient way to generate all the required parameters\n",
    "that will go through the model. \n",
    "\n",
    "Moreover, you might have noticed it generated some additional tensors: \n",
@@ -302,10 +302,10 @@
   ],
   "source": [
    "# Single segment input\n",
-    "single_seg_input = tokenizer.encode_plus(\"This is a sample input\")\n",
+    "single_seg_input = tokenizer(\"This is a sample input\")\n",
    "\n",
    "# Multiple segment input\n",
-    "multi_seg_input = tokenizer.encode_plus(\"This is segment A\", \"This is segment B\")\n",
+    "multi_seg_input = tokenizer(\"This is segment A\", \"This is segment B\")\n",
    "\n",
    "print(\"Single segment token (str): {}\".format(tokenizer.convert_ids_to_tokens(single_seg_input['input_ids'])))\n",
    "print(\"Single segment token (int): {}\".format(single_seg_input['input_ids']))\n",
@@ -344,9 +344,9 @@
   ],
   "source": [
    "# Padding highlight\n",
-    "tokens = tokenizer.batch_encode_plus(\n",
+    "tokens = tokenizer(\n",
    "    [\"This is a sample\", \"This is another longer sample text\"], \n",
-    "    pad_to_max_length=True  # First sentence will have some PADDED tokens to match second sequence length\n",
+    "    padding=True  # First sentence will have some PADDED tokens to match second sequence length\n",
    ")\n",
    "\n",
    "for i in range(2):\n",
@@ -405,8 +405,8 @@
   ],
   "source": [
    "# transformers generates a ready to use dictionary with all the required parameters for the specific framework.\n",
-    "input_tf = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"tf\")\n",
-    "input_pt = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"pt\")\n",
+    "input_tf = tokenizer(\"This is a sample input\", return_tensors=\"tf\")\n",
+    "input_pt = tokenizer(\"This is a sample input\", return_tensors=\"pt\")\n",
    "\n",
    "# Let's compare the outputs\n",
    "output_tf, output_pt = model_tf(input_tf), model_pt(**input_pt)\n",
@@ -464,7 +464,7 @@
    "from transformers import DistilBertModel\n",
    "\n",
    "bert_distil = DistilBertModel.from_pretrained('distilbert-base-cased')\n",
-    "input_pt = tokenizer.encode_plus(\n",
+    "input_pt = tokenizer(\n",
    "    'This is a sample input to demonstrate performance of distiled models especially inference time', \n",
    "    return_tensors=\"pt\"\n",
    ")\n",
@@ -514,7 +514,7 @@
    "de_bert = BertModel.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
    "de_tokenizer = BertTokenizer.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
    "\n",
-    "de_input = de_tokenizer.encode_plus(\n",
+    "de_input = de_tokenizer(\n",
    "    \"Hugging Face ist eine französische Firma mit Sitz in New-York.\",\n",
    "    return_tensors=\"pt\"\n",
    ")\n",
@@ -559,4 +559,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 4
-}
+}
--- a/notebooks/04-onnx-export.ipynb
+++ b/notebooks/04-onnx-export.ipynb
@@ -248,7 +248,7 @@
    "cpu_model = create_model_for_provider(\"onnx/bert-base-cased.onnx\", \"CPUExecutionProvider\")\n",
    "\n",
    "# Inputs are provided through numpy array\n",
-    "model_inputs = tokenizer.encode_plus(\"My name is Bert\", return_tensors=\"pt\")\n",
+    "model_inputs = tokenizer(\"My name is Bert\", return_tensors=\"pt\")\n",
    "inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}\n",
    "\n",
    "# Run the model (None = get all the outputs)\n",