Add CamemBERT to auto files and docs

This commit is contained in:
Louis MARTIN
2019-11-12 17:41:41 -08:00
committed by Julien Chaumond
parent 694d4fcbb6
commit 035fea5315
3 changed files with 18 additions and 4 deletions

View File

@@ -27,6 +27,7 @@ from .tokenization_xlnet import XLNetTokenizer
from .tokenization_xlm import XLMTokenizer
from .tokenization_roberta import RobertaTokenizer
from .tokenization_distilbert import DistilBertTokenizer
from .tokenization_camembert import CamembertTokenizer
logger = logging.getLogger(__name__)
@@ -41,6 +42,7 @@ class AutoTokenizer(object):
The tokenizer class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `camembert`: CamembertTokenizer (CamemBERT model)
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
- contains `roberta`: RobertaTokenizer (RoBERTa model)
- contains `bert`: BertTokenizer (Bert model)
@@ -64,8 +66,9 @@ class AutoTokenizer(object):
The tokenizer class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `camembert`: CamembertTokenizer (CamemBERT model)
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
- contains `roberta`: RobertaTokenizer (XLM model)
- contains `roberta`: RobertaTokenizer (RoBERTa model)
- contains `bert`: BertTokenizer (Bert model)
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
@@ -103,6 +106,8 @@ class AutoTokenizer(object):
"""
if 'distilbert' in pretrained_model_name_or_path:
return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'camembert' in pretrained_model_name_or_path:
return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
@@ -121,4 +126,4 @@ class AutoTokenizer(object):
return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
"'xlm', 'roberta', 'camembert', 'ctrl'".format(pretrained_model_name_or_path))