Update tokenizers to 0.7.0-rc5 (#3705)
This commit is contained in:
@@ -178,7 +178,7 @@
|
|||||||
"from tokenizers.pre_tokenizers import ByteLevel\n",
|
"from tokenizers.pre_tokenizers import ByteLevel\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# First we create an empty Byte-Pair Encoding model (i.e. not trained model)\n",
|
"# First we create an empty Byte-Pair Encoding model (i.e. not trained model)\n",
|
||||||
"tokenizer = Tokenizer(BPE.empty())\n",
|
"tokenizer = Tokenizer(BPE())\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Then we enable lower-casing and unicode-normalization\n",
|
"# Then we enable lower-casing and unicode-normalization\n",
|
||||||
"# The Sequence normalizer allows us to combine multiple Normalizer that will be\n",
|
"# The Sequence normalizer allows us to combine multiple Normalizer that will be\n",
|
||||||
@@ -307,7 +307,7 @@
|
|||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# Let's tokenizer a simple input\n",
|
"# Let's tokenizer a simple input\n",
|
||||||
"tokenizer.model = BPE.from_files('vocab.json', 'merges.txt')\n",
|
"tokenizer.model = BPE('vocab.json', 'merges.txt')\n",
|
||||||
"encoding = tokenizer.encode(\"This is a simple input to be tokenized\")\n",
|
"encoding = tokenizer.encode(\"This is a simple input to be tokenized\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(\"Encoded string: {}\".format(encoding.tokens))\n",
|
"print(\"Encoded string: {}\".format(encoding.tokens))\n",
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -96,7 +96,7 @@ setup(
|
|||||||
packages=find_packages("src"),
|
packages=find_packages("src"),
|
||||||
install_requires=[
|
install_requires=[
|
||||||
"numpy",
|
"numpy",
|
||||||
"tokenizers == 0.7.0rc3",
|
"tokenizers == 0.7.0rc5",
|
||||||
# dataclasses for Python versions that don't have it
|
# dataclasses for Python versions that don't have it
|
||||||
"dataclasses;python_version<'3.7'",
|
"dataclasses;python_version<'3.7'",
|
||||||
# accessing files from S3 directly
|
# accessing files from S3 directly
|
||||||
|
|||||||
@@ -265,12 +265,10 @@ class _OpenAIGPTCharBPETokenizer(BaseTokenizer):
|
|||||||
):
|
):
|
||||||
if vocab_file is not None and merges_file is not None:
|
if vocab_file is not None and merges_file is not None:
|
||||||
tokenizer = Tokenizer(
|
tokenizer = Tokenizer(
|
||||||
BPE.from_files(
|
BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token, end_of_word_suffix=suffix)
|
||||||
vocab_file, merges_file, dropout=dropout, unk_token=unk_token, end_of_word_suffix=suffix
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
tokenizer = Tokenizer(BPE.empty())
|
tokenizer = Tokenizer(BPE())
|
||||||
|
|
||||||
# Check for Unicode normalization first (before everything else)
|
# Check for Unicode normalization first (before everything else)
|
||||||
normalizers = []
|
normalizers = []
|
||||||
|
|||||||
@@ -362,7 +362,7 @@ class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer):
|
|||||||
):
|
):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token)
|
tokenizer = WordLevel(vocab_file, unk_token=unk_token)
|
||||||
tokenizer = Tokenizer(tokenizer)
|
tokenizer = Tokenizer(tokenizer)
|
||||||
except Exception:
|
except Exception:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
|||||||
Reference in New Issue
Block a user