Update tokenizers to 0.7.0-rc5 (#3705)
This commit is contained in:
@@ -178,7 +178,7 @@
|
||||
"from tokenizers.pre_tokenizers import ByteLevel\n",
|
||||
"\n",
|
||||
"# First we create an empty Byte-Pair Encoding model (i.e. an untrained model)\n",
|
||||
"tokenizer = Tokenizer(BPE.empty())\n",
|
||||
"tokenizer = Tokenizer(BPE())\n",
|
||||
"\n",
|
||||
"# Then we enable lower-casing and unicode-normalization\n",
|
||||
"# The Sequence normalizer allows us to combine multiple Normalizers that will be\n",
|
||||
@@ -307,7 +307,7 @@
|
||||
],
|
||||
"source": [
|
||||
"# Let's tokenize a simple input\n",
|
||||
"tokenizer.model = BPE.from_files('vocab.json', 'merges.txt')\n",
|
||||
"tokenizer.model = BPE('vocab.json', 'merges.txt')\n",
|
||||
"encoding = tokenizer.encode(\"This is a simple input to be tokenized\")\n",
|
||||
"\n",
|
||||
"print(\"Encoded string: {}\".format(encoding.tokens))\n",
|
||||
|
||||
Reference in New Issue
Block a user