[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)

* remove references to old API in docstring - update data processors

* style

* fix tests - better type checking error messages

* better type checking

* include awesome fix by @LysandreJik for #5310

* updated doc and examples
This commit is contained in:
Thomas Wolf
2020-06-26 19:48:14 +02:00
committed by GitHub
parent fd405e9a93
commit 601d4d699c
73 changed files with 180 additions and 138 deletions

View File

@@ -255,7 +255,7 @@
"# tokens_pt = torch.tensor([tokens_ids])\n",
"\n",
"# This code can be factored into one line as follows\n",
"tokens_pt2 = tokenizer.encode_plus(\"This is an input example\", return_tensors=\"pt\")\n",
"tokens_pt2 = tokenizer(\"This is an input example\", return_tensors=\"pt\")\n",
"\n",
"for key, value in tokens_pt2.items():\n",
" print(\"{}:\\n\\t{}\".format(key, value))\n",
@@ -268,7 +268,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"As you can see above, the method `encode_plus` provides a convenient way to generate all the required parameters\n",
"As you can see above, calling the tokenizer provides a convenient way to generate all the required parameters\n",
"that will go through the model. \n",
"\n",
"Moreover, you might have noticed it generated some additional tensors: \n",
@@ -302,10 +302,10 @@
],
"source": [
"# Single segment input\n",
"single_seg_input = tokenizer.encode_plus(\"This is a sample input\")\n",
"single_seg_input = tokenizer(\"This is a sample input\")\n",
"\n",
"# Multiple segment input\n",
"multi_seg_input = tokenizer.encode_plus(\"This is segment A\", \"This is segment B\")\n",
"multi_seg_input = tokenizer(\"This is segment A\", \"This is segment B\")\n",
"\n",
"print(\"Single segment token (str): {}\".format(tokenizer.convert_ids_to_tokens(single_seg_input['input_ids'])))\n",
"print(\"Single segment token (int): {}\".format(single_seg_input['input_ids']))\n",
@@ -344,9 +344,9 @@
],
"source": [
"# Padding highlight\n",
"tokens = tokenizer.batch_encode_plus(\n",
"tokens = tokenizer(\n",
" [\"This is a sample\", \"This is another longer sample text\"], \n",
" pad_to_max_length=True # First sentence will have some PADDED tokens to match second sequence length\n",
" padding=True # First sentence will have some PADDED tokens to match second sequence length\n",
")\n",
"\n",
"for i in range(2):\n",
@@ -405,8 +405,8 @@
],
"source": [
"# transformers generates a ready-to-use dictionary with all the required parameters for the specific framework.\n",
"input_tf = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"tf\")\n",
"input_pt = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"pt\")\n",
"input_tf = tokenizer(\"This is a sample input\", return_tensors=\"tf\")\n",
"input_pt = tokenizer(\"This is a sample input\", return_tensors=\"pt\")\n",
"\n",
"# Let's compare the outputs\n",
"output_tf, output_pt = model_tf(input_tf), model_pt(**input_pt)\n",
@@ -464,7 +464,7 @@
"from transformers import DistilBertModel\n",
"\n",
"bert_distil = DistilBertModel.from_pretrained('distilbert-base-cased')\n",
"input_pt = tokenizer.encode_plus(\n",
"input_pt = tokenizer(\n",
" 'This is a sample input to demonstrate performance of distiled models especially inference time', \n",
" return_tensors=\"pt\"\n",
")\n",
@@ -514,7 +514,7 @@
"de_bert = BertModel.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
"de_tokenizer = BertTokenizer.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
"\n",
"de_input = de_tokenizer.encode_plus(\n",
"de_input = de_tokenizer(\n",
" \"Hugging Face ist eine französische Firma mit Sitz in New-York.\",\n",
" return_tensors=\"pt\"\n",
")\n",
@@ -559,4 +559,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}

View File

@@ -248,7 +248,7 @@
"cpu_model = create_model_for_provider(\"onnx/bert-base-cased.onnx\", \"CPUExecutionProvider\")\n",
"\n",
"# Inputs are provided as numpy arrays\n",
"model_inputs = tokenizer.encode_plus(\"My name is Bert\", return_tensors=\"pt\")\n",
"model_inputs = tokenizer(\"My name is Bert\", return_tensors=\"pt\")\n",
"inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}\n",
"\n",
"# Run the model (None = get all the outputs)\n",