From b7cf9f43d259fbad45d899c1769110aafc9f410a Mon Sep 17 00:00:00 2001 From: Anthony MOI Date: Fri, 10 Apr 2020 14:23:49 -0400 Subject: [PATCH] Update tokenizers to 0.7.0-rc5 (#3705) --- notebooks/01-training-tokenizers.ipynb | 4 ++-- setup.py | 2 +- src/transformers/tokenization_openai.py | 6 ++---- src/transformers/tokenization_transfo_xl.py | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/notebooks/01-training-tokenizers.ipynb b/notebooks/01-training-tokenizers.ipynb index 59def58eb4..96c25c7998 100644 --- a/notebooks/01-training-tokenizers.ipynb +++ b/notebooks/01-training-tokenizers.ipynb @@ -178,7 +178,7 @@ "from tokenizers.pre_tokenizers import ByteLevel\n", "\n", "# First we create an empty Byte-Pair Encoding model (i.e. not trained model)\n", - "tokenizer = Tokenizer(BPE.empty())\n", + "tokenizer = Tokenizer(BPE())\n", "\n", "# Then we enable lower-casing and unicode-normalization\n", "# The Sequence normalizer allows us to combine multiple Normalizer that will be\n", @@ -307,7 +307,7 @@ ], "source": [ "# Let's tokenizer a simple input\n", - "tokenizer.model = BPE.from_files('vocab.json', 'merges.txt')\n", + "tokenizer.model = BPE('vocab.json', 'merges.txt')\n", "encoding = tokenizer.encode(\"This is a simple input to be tokenized\")\n", "\n", "print(\"Encoded string: {}\".format(encoding.tokens))\n", diff --git a/setup.py b/setup.py index d201ae88d1..10026a0ad8 100644 --- a/setup.py +++ b/setup.py @@ -96,7 +96,7 @@ setup( packages=find_packages("src"), install_requires=[ "numpy", - "tokenizers == 0.7.0rc3", + "tokenizers == 0.7.0rc5", # dataclasses for Python versions that don't have it "dataclasses;python_version<'3.7'", # accessing files from S3 directly diff --git a/src/transformers/tokenization_openai.py b/src/transformers/tokenization_openai.py index ea0f52a806..9b150d4772 100644 --- a/src/transformers/tokenization_openai.py +++ b/src/transformers/tokenization_openai.py @@ -265,12 +265,10 @@ class _OpenAIGPTCharBPETokenizer(BaseTokenizer): ): if vocab_file is not None and merges_file is not None: tokenizer = Tokenizer( - BPE.from_files( - vocab_file, merges_file, dropout=dropout, unk_token=unk_token, end_of_word_suffix=suffix - ) + BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token, end_of_word_suffix=suffix) ) else: - tokenizer = Tokenizer(BPE.empty()) + tokenizer = Tokenizer(BPE()) # Check for Unicode normalization first (before everything else) normalizers = [] diff --git a/src/transformers/tokenization_transfo_xl.py b/src/transformers/tokenization_transfo_xl.py index e5aca49d3a..2392394bbe 100644 --- a/src/transformers/tokenization_transfo_xl.py +++ b/src/transformers/tokenization_transfo_xl.py @@ -362,7 +362,7 @@ class _TransfoXLDelimiterLookupTokenizer(BaseTokenizer): ): try: - tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token) + tokenizer = WordLevel(vocab_file, unk_token=unk_token) tokenizer = Tokenizer(tokenizer) except Exception: raise ValueError(