[tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)
* remove references to old API in docstring - update data processors * style * fix tests - better type checking error messages * better type checking * include awesome fix by @LysandreJik for #5310 * updated doc and examples
This commit is contained in:
@@ -255,7 +255,7 @@
|
||||
"# tokens_pt = torch.tensor([tokens_ids])\n",
|
||||
"\n",
|
||||
"# This code can be factored into one-line as follow\n",
|
||||
"tokens_pt2 = tokenizer.encode_plus(\"This is an input example\", return_tensors=\"pt\")\n",
|
||||
"tokens_pt2 = tokenizer(\"This is an input example\", return_tensors=\"pt\")\n",
|
||||
"\n",
|
||||
"for key, value in tokens_pt2.items():\n",
|
||||
" print(\"{}:\\n\\t{}\".format(key, value))\n",
|
||||
@@ -268,7 +268,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As you can see above, the method `encode_plus` provides a convenient way to generate all the required parameters\n",
|
||||
"As you can see above, calling the tokenizer provides a convenient way to generate all the required parameters\n",
|
||||
"that will go through the model. \n",
|
||||
"\n",
|
||||
"Moreover, you might have noticed it generated some additional tensors: \n",
|
||||
@@ -302,10 +302,10 @@
|
||||
],
|
||||
"source": [
|
||||
"# Single segment input\n",
|
||||
"single_seg_input = tokenizer.encode_plus(\"This is a sample input\")\n",
|
||||
"single_seg_input = tokenizer(\"This is a sample input\")\n",
|
||||
"\n",
|
||||
"# Multiple segment input\n",
|
||||
"multi_seg_input = tokenizer.encode_plus(\"This is segment A\", \"This is segment B\")\n",
|
||||
"multi_seg_input = tokenizer(\"This is segment A\", \"This is segment B\")\n",
|
||||
"\n",
|
||||
"print(\"Single segment token (str): {}\".format(tokenizer.convert_ids_to_tokens(single_seg_input['input_ids'])))\n",
|
||||
"print(\"Single segment token (int): {}\".format(single_seg_input['input_ids']))\n",
|
||||
@@ -344,9 +344,9 @@
|
||||
],
|
||||
"source": [
|
||||
"# Padding highlight\n",
|
||||
"tokens = tokenizer.batch_encode_plus(\n",
|
||||
"tokens = tokenizer(\n",
|
||||
" [\"This is a sample\", \"This is another longer sample text\"], \n",
|
||||
" pad_to_max_length=True # First sentence will have some PADDED tokens to match second sequence length\n",
|
||||
" padding=True # First sentence will have some PADDED tokens to match second sequence length\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"for i in range(2):\n",
|
||||
@@ -405,8 +405,8 @@
|
||||
],
|
||||
"source": [
|
||||
"# transformers generates a ready to use dictionary with all the required parameters for the specific framework.\n",
|
||||
"input_tf = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"tf\")\n",
|
||||
"input_pt = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"pt\")\n",
|
||||
"input_tf = tokenizer(\"This is a sample input\", return_tensors=\"tf\")\n",
|
||||
"input_pt = tokenizer(\"This is a sample input\", return_tensors=\"pt\")\n",
|
||||
"\n",
|
||||
"# Let's compare the outputs\n",
|
||||
"output_tf, output_pt = model_tf(input_tf), model_pt(**input_pt)\n",
|
||||
@@ -464,7 +464,7 @@
|
||||
"from transformers import DistilBertModel\n",
|
||||
"\n",
|
||||
"bert_distil = DistilBertModel.from_pretrained('distilbert-base-cased')\n",
|
||||
"input_pt = tokenizer.encode_plus(\n",
|
||||
"input_pt = tokenizer(\n",
|
||||
" 'This is a sample input to demonstrate performance of distiled models especially inference time', \n",
|
||||
" return_tensors=\"pt\"\n",
|
||||
")\n",
|
||||
@@ -514,7 +514,7 @@
|
||||
"de_bert = BertModel.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
|
||||
"de_tokenizer = BertTokenizer.from_pretrained(\"dbmdz/bert-base-german-cased\")\n",
|
||||
"\n",
|
||||
"de_input = de_tokenizer.encode_plus(\n",
|
||||
"de_input = de_tokenizer(\n",
|
||||
" \"Hugging Face ist eine französische Firma mit Sitz in New-York.\",\n",
|
||||
" return_tensors=\"pt\"\n",
|
||||
")\n",
|
||||
@@ -559,4 +559,4 @@
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
}
|
||||
@@ -248,7 +248,7 @@
|
||||
"cpu_model = create_model_for_provider(\"onnx/bert-base-cased.onnx\", \"CPUExecutionProvider\")\n",
|
||||
"\n",
|
||||
"# Inputs are provided through numpy array\n",
|
||||
"model_inputs = tokenizer.encode_plus(\"My name is Bert\", return_tensors=\"pt\")\n",
|
||||
"model_inputs = tokenizer(\"My name is Bert\", return_tensors=\"pt\")\n",
|
||||
"inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}\n",
|
||||
"\n",
|
||||
"# Run the model (None = get all the outputs)\n",
|
||||
|
||||
Reference in New Issue
Block a user