From fb6c70a91d3183742ce0a6d97add68103253ca3a Mon Sep 17 00:00:00 2001
From: Louis MARTIN
Date: Tue, 12 Nov 2019 17:11:49 -0800
Subject: [PATCH] Update tokenization_camembert.py with urls

---
 transformers/tokenization_camembert.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/transformers/tokenization_camembert.py b/transformers/tokenization_camembert.py
index 9facf7d911..0a6e751351 100644
--- a/transformers/tokenization_camembert.py
+++ b/transformers/tokenization_camembert.py
@@ -11,7 +11,7 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
+# limitations under the License
 """ Tokenization classes for Camembert model."""
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
@@ -20,6 +20,19 @@ import sentencepiece as spm
 from transformers.tokenization_utils import PreTrainedTokenizer
 
 
+VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'camembert-base': "https://dl.fbaipublicfiles.com/camembert/camembert-base-v0-sentencepiece.bpe.model",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'camembert-base': None,
+}
+
 class CamembertTokenizer(PreTrainedTokenizer):
     """
         Adapted from RobertaTokenizer and XLNetTokenizer
@@ -27,7 +40,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
 
             - requires `SentencePiece <https://github.com/google/sentencepiece>`_
     """
-    vocab_files_names = {'vocab_file': None}
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
                  cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):