From d32ce2c8df7053c19061b709465cdcc765e45a15 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Mon, 18 Nov 2019 14:14:19 +0100
Subject: [PATCH 1/4] camembert: add wrapper for
 CamembertForTokenClassification

---
 transformers/modeling_camembert.py | 38 +++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/transformers/modeling_camembert.py b/transformers/modeling_camembert.py
index 05538926e2..f302346f2d 100644
--- a/transformers/modeling_camembert.py
+++ b/transformers/modeling_camembert.py
@@ -20,7 +20,7 @@ from __future__ import (absolute_import, division, print_function,
 
 import logging
 
-from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice
+from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification
 from .configuration_camembert import CamembertConfig
 from .file_utils import add_start_docstrings
 
@@ -255,3 +255,39 @@ class CamembertForMultipleChoice(RobertaForMultipleChoice):
     """
     config_class = CamembertConfig
     pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@add_start_docstrings("""CamemBERT Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    CAMEMBERT_START_DOCSTRING, CAMEMBERT_INPUTS_DOCSTRING)
+class CamembertForTokenClassification(RobertaForTokenClassification):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the token classification loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
+        model = CamembertForTokenClassification.from_pretrained('camembert-base')
+        input_ids = torch.tensor(tokenizer.encode("J'aime le camembert !", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, scores = outputs[:2]
+
+    """
+    config_class = CamembertConfig
+    pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP

From 33753d9139307d9635db0309b6ddb9c53192c60a Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Mon, 18 Nov 2019 14:14:54 +0100
Subject: [PATCH 2/4] module: import CamembertForTokenClassification

---
 transformers/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/transformers/__init__.py b/transformers/__init__.py
index cdf0669b39..5c7b0a6197 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -100,6 +100,7 @@ if is_torch_available():
                                 DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_camembert import (CamembertForMaskedLM, CamembertModel,
                                 CamembertForSequenceClassification, CamembertForMultipleChoice,
+                                CamembertForTokenClassification,
                                 CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
 

From 0b3d45eb64607158977f546d57f90eae268c7836 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Mon, 18 Nov 2019 15:49:44 +0100
Subject: [PATCH 3/4] camembert: add implementation for save_vocabulary method

---
 transformers/tokenization_camembert.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/transformers/tokenization_camembert.py b/transformers/tokenization_camembert.py
index 41d3d74cff..bf2a6fe993 100644
--- a/transformers/tokenization_camembert.py
+++ b/transformers/tokenization_camembert.py
@@ -16,9 +16,14 @@
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 
+import logging
+import os
+from shutil import copyfile
+
 import sentencepiece as spm
 from transformers.tokenization_utils import PreTrainedTokenizer
 
+logger = logging.getLogger(__name__)
 
 VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}
 
@@ -55,6 +60,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
         self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
         # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
         # sentencepiece vocabulary (this is the case for <s> and </s>
         self.fairseq_tokens_to_ids = {'<s>NOTUSED': 0, '<pad>': 1, '</s>NOTUSED': 2, '<unk>': 3}
@@ -135,3 +141,17 @@ class CamembertTokenizer(PreTrainedTokenizer):
         if index in self.fairseq_ids_to_tokens:
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
+
+    def save_vocabulary(self, save_directory):
+        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
+            to a directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)

From 56c84863a1a20dfb82b928c5c9f77c21d9def8c7 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Mon, 18 Nov 2019 15:50:16 +0100
Subject: [PATCH 4/4] camembert: add support for CamemBERT in run_ner example

---
 examples/run_ner.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/run_ner.py b/examples/run_ner.py
index 4359e587ae..127d63a6cd 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -37,6 +37,7 @@ from transformers import AdamW, get_linear_schedule_with_warmup
 from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
 from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
 from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer
+from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer
 
 logger = logging.getLogger(__name__)
 
@@ -47,7 +48,8 @@ ALL_MODELS = sum(
 MODEL_CLASSES = {
     "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
     "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
-    "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer)
+    "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
+    "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
 }