From 035fea53157b43d311b1e1164395398c4ec0dda5 Mon Sep 17 00:00:00 2001 From: Louis MARTIN Date: Tue, 12 Nov 2019 17:41:41 -0800 Subject: [PATCH] Add CamemBERT to auto files and docs --- docs/source/pretrained_models.rst | 6 +++++- transformers/configuration_auto.py | 7 ++++++- transformers/tokenization_auto.py | 9 +++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index edb47e7f1c..b0a578fd80 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -155,5 +155,9 @@ Here is the full list of the currently provided pretrained models together with | CTRL | ``ctrl`` | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters | | | | | Salesforce's Large-sized CTRL English model | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| CamemBERT | ``camembert-base`` | | 12-layer, 768-hidden, 12-heads, 110M parameters | +| | | | CamemBERT using the BERT-base architecture | +| | | (see `details `__) | ++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -.. `__ \ No newline at end of file +.. `__ diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py index edd21a670c..2906136139 100644 --- a/transformers/configuration_auto.py +++ b/transformers/configuration_auto.py @@ -27,6 +27,7 @@ from .configuration_xlm import XLMConfig from .configuration_roberta import RobertaConfig from .configuration_distilbert import DistilBertConfig from .configuration_ctrl import CTRLConfig +from .configuration_camembert import CamembertConfig logger = logging.getLogger(__name__) @@ -50,6 +51,7 @@ class AutoConfig(object): - contains `xlnet`: XLNetConfig (XLNet model) - contains `xlm`: XLMConfig (XLM model) - contains `roberta`: RobertaConfig (RoBERTa model) + - contains `camembert`: CamembertConfig (CamemBERT model) - contains `ctrl` : CTRLConfig (CTRL model) This class cannot be instantiated using `__init__()` (throw an error). """ @@ -72,6 +74,7 @@ class AutoConfig(object): - contains `xlnet`: XLNetConfig (XLNet model) - contains `xlm`: XLMConfig (XLM model) - contains `roberta`: RobertaConfig (RoBERTa model) + - contains `camembert`: CamembertConfig (CamemBERT model) - contains `ctrl` : CTRLConfig (CTRL model) Params: pretrained_model_name_or_path: either: @@ -116,6 +119,8 @@ class AutoConfig(object): """ if 'distilbert' in pretrained_model_name_or_path: return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'camembert' in pretrained_model_name_or_path: + return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) elif 'bert' in pretrained_model_name_or_path: @@ -134,4 +139,4 @@ class AutoConfig(object): return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) raise ValueError("Unrecognized model identifier in {}. Should contains one of " "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)) + "'xlm', 'roberta', 'camembert', 'ctrl'".format(pretrained_model_name_or_path)) diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py index ec056de17f..4510159905 100644 --- a/transformers/tokenization_auto.py +++ b/transformers/tokenization_auto.py @@ -27,6 +27,7 @@ from .tokenization_xlnet import XLNetTokenizer from .tokenization_xlm import XLMTokenizer from .tokenization_roberta import RobertaTokenizer from .tokenization_distilbert import DistilBertTokenizer +from .tokenization_camembert import CamembertTokenizer logger = logging.getLogger(__name__) @@ -41,6 +42,7 @@ class AutoTokenizer(object): The tokenizer class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `camembert`: CamembertTokenizer (CamemBERT model) - contains `distilbert`: DistilBertTokenizer (DistilBert model) - contains `roberta`: RobertaTokenizer (RoBERTa model) - contains `bert`: BertTokenizer (Bert model) @@ -64,8 +66,9 @@ class AutoTokenizer(object): The tokenizer class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `camembert`: CamembertTokenizer (CamemBERT model) - contains `distilbert`: DistilBertTokenizer (DistilBert model) - - contains `roberta`: RobertaTokenizer (XLM model) + - contains `roberta`: RobertaTokenizer (RoBERTa model) - contains `bert`: BertTokenizer (Bert model) - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) @@ -103,6 +106,8 @@ class AutoTokenizer(object): """ if 'distilbert' in pretrained_model_name_or_path: return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif 'camembert' in pretrained_model_name_or_path: + return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'bert' in pretrained_model_name_or_path: @@ -121,4 +126,4 @@ class AutoTokenizer(object): return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) raise ValueError("Unrecognized model identifier in {}. Should contains one of " "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)) + "'xlm', 'roberta', 'camembert', 'ctrl'".format(pretrained_model_name_or_path))