From d32ce2c8df7053c19061b709465cdcc765e45a15 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 18 Nov 2019 14:14:19 +0100 Subject: [PATCH 1/4] camembert: add wrapper for CamembertForTokenClassification --- transformers/modeling_camembert.py | 38 +++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/transformers/modeling_camembert.py b/transformers/modeling_camembert.py index 05538926e2..f302346f2d 100644 --- a/transformers/modeling_camembert.py +++ b/transformers/modeling_camembert.py @@ -20,7 +20,7 @@ from __future__ import (absolute_import, division, print_function, import logging -from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice +from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification from .configuration_camembert import CamembertConfig from .file_utils import add_start_docstrings @@ -255,3 +255,39 @@ class CamembertForMultipleChoice(RobertaForMultipleChoice): """ config_class = CamembertConfig pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP + + +@add_start_docstrings("""CamemBERT Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING) +class CamembertForTokenClassification(RobertaForTokenClassification): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss. 
+ **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` + Classification scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = CamembertTokenizer.from_pretrained('camembert-base') + model = CamembertForTokenClassification.from_pretrained('camembert-base') + input_ids = torch.tensor(tokenizer.encode("J'aime le camembert !", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, scores = outputs[:2] + + """ + config_class = CamembertConfig + pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP From 33753d9139307d9635db0309b6ddb9c53192c60a Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 18 Nov 2019 14:14:54 +0100 Subject: [PATCH 2/4] module: import CamembertForTokenClassification --- transformers/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transformers/__init__.py b/transformers/__init__.py index cdf0669b39..5c7b0a6197 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -100,6 +100,7 @@ if is_torch_available(): DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_camembert import (CamembertForMaskedLM, CamembertModel, CamembertForSequenceClassification, CamembertForMultipleChoice, + 
+ CamembertForTokenClassification, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model From 0b3d45eb64607158977f546d57f90eae268c7836 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 18 Nov 2019 15:49:44 +0100 Subject: [PATCH 3/4] camembert: add implementation for save_vocabulary method --- transformers/tokenization_camembert.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/transformers/tokenization_camembert.py b/transformers/tokenization_camembert.py index 41d3d74cff..bf2a6fe993 100644 --- a/transformers/tokenization_camembert.py +++ b/transformers/tokenization_camembert.py @@ -16,9 +16,14 @@ from __future__ import (absolute_import, division, print_function, unicode_literals) +import logging +import os +from shutil import copyfile + import sentencepiece as spm from transformers.tokenization_utils import PreTrainedTokenizer +logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} @@ -55,6 +60,7 @@ class CamembertTokenizer(PreTrainedTokenizer): self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(str(vocab_file)) + self.vocab_file = vocab_file # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual # sentencepiece vocabulary (this is the case for <s> and </s> self.fairseq_tokens_to_ids = {'<s>NOTUSED': 0, '<pad>': 1, '</s>NOTUSED': 2, '<unk>': 3} @@ -135,3 +141,17 @@ class CamembertTokenizer(PreTrainedTokenizer): if index in self.fairseq_ids_to_tokens: return self.fairseq_ids_to_tokens[index] return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def save_vocabulary(self, save_directory): + """ Save the sentencepiece vocabulary (copy original file) and special tokens file + to a directory. 
+ """ + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) From 56c84863a1a20dfb82b928c5c9f77c21d9def8c7 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 18 Nov 2019 15:50:16 +0100 Subject: [PATCH 4/4] camembert: add support for CamemBERT in run_ner example --- examples/run_ner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/run_ner.py b/examples/run_ner.py index 4359e587ae..127d63a6cd 100644 --- a/examples/run_ner.py +++ b/examples/run_ner.py @@ -37,6 +37,7 @@ from transformers import AdamW, get_linear_schedule_with_warmup from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer +from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer logger = logging.getLogger(__name__) @@ -47,7 +48,8 @@ ALL_MODELS = sum( MODEL_CLASSES = { "bert": (BertConfig, BertForTokenClassification, BertTokenizer), "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer), - "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer) + "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer), + "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer), }