From d0e594f9dbdb3cf19a8b4e255b59f7632be571e2 Mon Sep 17 00:00:00 2001
From: Dima <dimagalat@users.noreply.github.com>
Date: Thu, 2 Jan 2020 09:45:48 +1100
Subject: [PATCH 01/17] Releasing file lock

---
 src/transformers/file_utils.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 60cc9baa6c..8788c75acc 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -388,13 +388,13 @@ def get_from_cache(
                 # we are copying the file before closing it, so flush to avoid truncation
                 temp_file.flush()
 
-                logger.info("storing %s in cache at %s", url, cache_path)
-                os.rename(temp_file.name, cache_path)
+            logger.info("storing %s in cache at %s", url, cache_path)
+            os.rename(temp_file.name, cache_path)
 
-                logger.info("creating metadata file for %s", cache_path)
-                meta = {"url": url, "etag": etag}
-                meta_path = cache_path + ".json"
-                with open(meta_path, "w") as meta_file:
-                    json.dump(meta, meta_file)
+            logger.info("creating metadata file for %s", cache_path)
+            meta = {"url": url, "etag": etag}
+            meta_path = cache_path + ".json"
+            with open(meta_path, "w") as meta_file:
+                json.dump(meta, meta_file)
 
     return cache_path

From e2810edc8f5c6f08b0f05a548b2761b35135b6de Mon Sep 17 00:00:00 2001
From: Dima Galat <dima.galat@outlook.com>
Date: Tue, 7 Jan 2020 11:47:25 +1100
Subject: [PATCH 02/17] removing redundant .flush

---
 src/transformers/file_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 8788c75acc..e74c895485 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -385,8 +385,6 @@ def get_from_cache(
                 else:
                     http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)
 
-                # we are copying the file before closing it, so flush to avoid truncation
-                temp_file.flush()
 
             logger.info("storing %s in cache at %s", url, cache_path)
             os.rename(temp_file.name, cache_path)

From 2926852f143d9889b4c7e95dbe9690e9d2f667ef Mon Sep 17 00:00:00 2001
From: Dima Galat <dima.galat@outlook.com>
Date: Tue, 7 Jan 2020 11:56:03 +1100
Subject: [PATCH 03/17] fixed formatting

---
 src/transformers/file_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index e74c895485..c2689f6a47 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -385,7 +385,6 @@ def get_from_cache(
                 else:
                     http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)
 
-
             logger.info("storing %s in cache at %s", url, cache_path)
             os.rename(temp_file.name, cache_path)
 

From f26a353057a0dde55b506408954457773e6835cc Mon Sep 17 00:00:00 2001
From: Rishabh Manoj <Rishabh.Manoj@iiitb.org>
Date: Wed, 8 Jan 2020 21:12:34 +0530
Subject: [PATCH 04/17] Update pipelines.py

Modified QA pipeline to consider all features for each example before generating topk answers.
The current pipeline only takes one SquadExample, one SquadFeature, one start logit list, and one end logit list to retrieve the answer. This is not correct, as one SquadExample can produce multiple SquadFeatures.
---
 src/transformers/pipelines.py | 96 +++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 43 deletions(-)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 886d452375..45c137b343 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -705,55 +705,65 @@ class QuestionAnsweringPipeline(Pipeline):
 
         # Convert inputs to features
         examples = self._args_parser(*texts, **kwargs)
-        features = squad_convert_examples_to_features(
-            examples, self.tokenizer, kwargs["max_seq_len"], kwargs["doc_stride"], kwargs["max_question_len"], False
-        )
-        fw_args = self.inputs_for_model([f.__dict__ for f in features])
+        features_list = [ squad_convert_examples_to_features(
+                            [example], 
+                            self.tokenizer, 
+                            kwargs["max_seq_len"], 
+                            kwargs["doc_stride"], 
+                            kwargs["max_question_len"], 
+                            False
+                            ) for example in examples ]
+        all_answers = []
+        for features, example in zip(features_list, examples):
+            fw_args = self.inputs_for_model([f.__dict__ for f in features])
 
-        # Manage tensor allocation on correct device
-        with self.device_placement():
-            if self.framework == "tf":
-                fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
-                start, end = self.model(fw_args)
-                start, end = start.numpy(), end.numpy()
-            else:
-                with torch.no_grad():
-                    # Retrieve the score for the context tokens only (removing question tokens)
-                    fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
-                    start, end = self.model(**fw_args)
-                    start, end = start.cpu().numpy(), end.cpu().numpy()
+            # Manage tensor allocation on correct device
+            with self.device_placement():
+                if self.framework == "tf":
+                    fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
+                    start, end = self.model(fw_args)
+                    start, end = start.numpy(), end.numpy()
+                else:
+                    with torch.no_grad():
+                        # Retrieve the score for the context tokens only (removing question tokens)
+                        fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}
+                        start, end = self.model(**fw_args)
+                        start, end = start.cpu().numpy(), end.cpu().numpy()
 
-        answers = []
-        for (example, feature, start_, end_) in zip(examples, features, start, end):
-            # Normalize logits and spans to retrieve the answer
-            start_ = np.exp(start_) / np.sum(np.exp(start_))
-            end_ = np.exp(end_) / np.sum(np.exp(end_))
+            answers = []
+            for (feature, start_, end_) in zip(features, start, end):
+                # Normalize logits and spans to retrieve the answer
+                start_ = np.exp(start_) / np.sum(np.exp(start_))
+                end_ = np.exp(end_) / np.sum(np.exp(end_))
 
-            # Mask padding and question
-            start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1)
+                # Mask padding and question
+                start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1)
 
-            # TODO : What happens if not possible
-            # Mask CLS
-            start_[0] = end_[0] = 0
+                # TODO : What happens if not possible
+                # Mask CLS
+                start_[0] = end_[0] = 0
 
-            starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
-            char_to_word = np.array(example.char_to_word_offset)
+                starts, ends, scores = self.decode(start_, end_, kwargs["topk"], kwargs["max_answer_len"])
+                char_to_word = np.array(example.char_to_word_offset)
 
-            # Convert the answer (tokens) back to the original text
-            answers += [
-                {
-                    "score": score.item(),
-                    "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
-                    "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
-                    "answer": " ".join(
-                        example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
-                    ),
-                }
-                for s, e, score in zip(starts, ends, scores)
-            ]
-        if len(answers) == 1:
-            return answers[0]
-        return answers
+                # Convert the answer (tokens) back to the original text
+                answers += [
+                    {
+                        "score": score.item(),
+                        "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
+                        "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
+                        "answer": " ".join(
+                            example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
+                        ),
+                    }
+                    for s, e, score in zip(starts, ends, scores)
+                ]
+            answers = sorted(answers, key = lambda x:x['score'], reverse=True)[:kwargs["topk"]]    
+            all_answers+=answers
+            
+        if len(all_answers) == 1:
+           return all_answers[0]
+        return all_answers
 
     def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
         """

From 90d3b787f64815cc6d5dfefce327210086e72019 Mon Sep 17 00:00:00 2001
From: Martin Schrimpf <mschrimpf@users.noreply.github.com>
Date: Fri, 10 Jan 2020 15:09:10 -0500
Subject: [PATCH 05/17] move rp_bucket to relative_attention_bias' device

otherwise, `rp_bucket` will always be on cpu and fail if `self.relative_attention_bias` is on cuda
---
 src/transformers/modeling_t5.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py
index 81906e86ea..fb37f6fa4e 100644
--- a/src/transformers/modeling_t5.py
+++ b/src/transformers/modeling_t5.py
@@ -286,6 +286,7 @@ class T5Attention(nn.Module):
             bidirectional=not self.is_decoder,
             num_buckets=self.relative_attention_num_buckets,
         )
+        rp_bucket = rp_bucket.to(self.relative_attention_bias.weight.device)
         values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
         values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, qlen, klen)
         return values

From ebba9e929dcea98e65d1f0e9c535d38370ee24b1 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Fri, 10 Jan 2020 19:14:58 -0500
Subject: [PATCH 06/17] minor spring cleaning - missing configs + processing

---
 examples/distillation/lm_seqs_dataset.py        | 17 +++++++++++++++++
 .../distilbert-base-multilingual-cased.json     | 15 +++++++++++++++
 .../training_configs/distilroberta-base.json    | 14 ++++++++++++++
 3 files changed, 46 insertions(+)
 create mode 100644 examples/distillation/training_configs/distilbert-base-multilingual-cased.json
 create mode 100644 examples/distillation/training_configs/distilroberta-base.json

diff --git a/examples/distillation/lm_seqs_dataset.py b/examples/distillation/lm_seqs_dataset.py
index 691e010cf2..a29e9efb28 100644
--- a/examples/distillation/lm_seqs_dataset.py
+++ b/examples/distillation/lm_seqs_dataset.py
@@ -42,6 +42,7 @@ class LmSeqsDataset(Dataset):
         self.check()
         self.remove_long_sequences()
         self.remove_empty_sequences()
+        self.remove_unknown_sequences()
         self.check()
         self.print_statistics()
 
@@ -109,6 +110,22 @@ class LmSeqsDataset(Dataset):
         new_size = len(self)
         logger.info(f"Remove {init_size - new_size} too short (<=11 tokens) sequences.")
 
+    def remove_unknown_sequences(self):
+        """
+        Remove sequences with a (too) high level of unknown tokens.
+        """
+        if 'unk_token' not in self.params.special_tok_ids:
+            return
+        else:
+            unk_token_id = self.params.special_tok_ids['unk_token']
+        init_size = len(self)
+        unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
+        indices = (unk_occs/self.lengths) < 0.5
+        self.token_ids = self.token_ids[indices]
+        self.lengths = self.lengths[indices]
+        new_size = len(self)
+        logger.info(f'Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).')
+
     def print_statistics(self):
         """
         Print some statistics on the corpus. Only the master process.
diff --git a/examples/distillation/training_configs/distilbert-base-multilingual-cased.json b/examples/distillation/training_configs/distilbert-base-multilingual-cased.json
new file mode 100644
index 0000000000..f76e7febcb
--- /dev/null
+++ b/examples/distillation/training_configs/distilbert-base-multilingual-cased.json
@@ -0,0 +1,15 @@
+{
+	"activation": "gelu",
+	"attention_dropout": 0.1,
+	"dim": 768,
+	"dropout": 0.1,
+	"hidden_dim": 3072,
+	"initializer_range": 0.02,
+	"max_position_embeddings": 512,
+	"n_heads": 12,
+	"n_layers": 6,
+	"sinusoidal_pos_embds": true,
+	"tie_weights_": true,
+	"vocab_size": 119547
+  }
+  
\ No newline at end of file
diff --git a/examples/distillation/training_configs/distilroberta-base.json b/examples/distillation/training_configs/distilroberta-base.json
new file mode 100644
index 0000000000..2d90ef6380
--- /dev/null
+++ b/examples/distillation/training_configs/distilroberta-base.json
@@ -0,0 +1,14 @@
+{
+    "vocab_size": 50265,
+    "hidden_size": 768,
+    "num_hidden_layers": 6,
+    "num_attention_heads": 12,
+    "intermediate_size": 3072,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "attention_probs_dropout_prob": 0.1,
+    "max_position_embeddings": 514,
+    "type_vocab_size": 1,
+    "initializer_range": 0.02,
+    "layer_norm_eps": 0.00001
+}
\ No newline at end of file

From e83d9f1c1d29890dd470de74f41627630e52abdc Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Fri, 10 Jan 2020 19:34:25 -0500
Subject: [PATCH 07/17] cleaning - change ' to " (black requirements)

---
 examples/distillation/lm_seqs_dataset.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/distillation/lm_seqs_dataset.py b/examples/distillation/lm_seqs_dataset.py
index a29e9efb28..8f444f4e0e 100644
--- a/examples/distillation/lm_seqs_dataset.py
+++ b/examples/distillation/lm_seqs_dataset.py
@@ -114,17 +114,17 @@ class LmSeqsDataset(Dataset):
         """
         Remove sequences with a (too) high level of unknown tokens.
         """
-        if 'unk_token' not in self.params.special_tok_ids:
+        if "unk_token" not in self.params.special_tok_ids:
             return
         else:
-            unk_token_id = self.params.special_tok_ids['unk_token']
+            unk_token_id = self.params.special_tok_ids["unk_token"]
         init_size = len(self)
         unk_occs = np.array([np.count_nonzero(a == unk_token_id) for a in self.token_ids])
-        indices = (unk_occs/self.lengths) < 0.5
+        indices = (unk_occs / self.lengths) < 0.5
         self.token_ids = self.token_ids[indices]
         self.lengths = self.lengths[indices]
         new_size = len(self)
-        logger.info(f'Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).')
+        logger.info(f"Remove {init_size - new_size} sequences with a high level of unknown tokens (50%).")
 
     def print_statistics(self):
         """

From a3085020ed0d81d4903c50967687192e3101e770 Mon Sep 17 00:00:00 2001
From: IWillPull <52743253+IWillPull@users.noreply.github.com>
Date: Sat, 11 Jan 2020 06:00:07 +0200
Subject: [PATCH 08/17] Added repetition penalty to PPLM example (#2436)

* Added repetition penalty

* Default PPLM repetition_penalty to neutral

* Minor modifications to comply with reviewer's suggestions. (j -> token_idx)

* Formatted code with `make style`
---
 examples/pplm/run_pplm.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/examples/pplm/run_pplm.py b/examples/pplm/run_pplm.py
index 8c405b56ad..b334a0098c 100644
--- a/examples/pplm/run_pplm.py
+++ b/examples/pplm/run_pplm.py
@@ -344,6 +344,7 @@ def full_text_generation(
     gamma=1.5,
     gm_scale=0.9,
     kl_scale=0.01,
+    repetition_penalty=1.0,
     **kwargs
 ):
     classifier, class_id = get_classifier(discrim, class_label, device)
@@ -368,7 +369,14 @@ def full_text_generation(
         raise Exception("Specify either a bag of words or a discriminator")
 
     unpert_gen_tok_text, _, _ = generate_text_pplm(
-        model=model, tokenizer=tokenizer, context=context, device=device, length=length, sample=sample, perturb=False
+        model=model,
+        tokenizer=tokenizer,
+        context=context,
+        device=device,
+        length=length,
+        sample=sample,
+        perturb=False,
+        repetition_penalty=repetition_penalty,
     )
     if device == "cuda":
         torch.cuda.empty_cache()
@@ -401,6 +409,7 @@ def full_text_generation(
             gamma=gamma,
             gm_scale=gm_scale,
             kl_scale=kl_scale,
+            repetition_penalty=repetition_penalty,
         )
         pert_gen_tok_texts.append(pert_gen_tok_text)
         if classifier is not None:
@@ -437,6 +446,7 @@ def generate_text_pplm(
     gamma=1.5,
     gm_scale=0.9,
     kl_scale=0.01,
+    repetition_penalty=1.0,
 ):
     output_so_far = None
     if context:
@@ -508,6 +518,13 @@ def generate_text_pplm(
 
         pert_logits, past, pert_all_hidden = model(last, past=pert_past)
         pert_logits = pert_logits[:, -1, :] / temperature  # + SMALL_CONST
+
+        for token_idx in set(output_so_far[0].tolist()):
+            if pert_logits[0, token_idx] < 0:
+                pert_logits[0, token_idx] *= repetition_penalty
+            else:
+                pert_logits[0, token_idx] /= repetition_penalty
+
         pert_probs = F.softmax(pert_logits, dim=-1)
 
         if classifier is not None:
@@ -588,6 +605,7 @@ def run_pplm_example(
     seed=0,
     no_cuda=False,
     colorama=False,
+    repetition_penalty=1.0,
 ):
     # set Random seed
     torch.manual_seed(seed)
@@ -655,6 +673,7 @@ def run_pplm_example(
         gamma=gamma,
         gm_scale=gm_scale,
         kl_scale=kl_scale,
+        repetition_penalty=repetition_penalty,
     )
 
     # untokenize unperturbed text
@@ -767,6 +786,9 @@ if __name__ == "__main__":
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument("--no_cuda", action="store_true", help="no cuda")
     parser.add_argument("--colorama", action="store_true", help="colors keywords")
+    parser.add_argument(
+        "--repetition_penalty", type=float, default=1.0, help="Penalize repetition. More than 1.0 -> less repetition",
+    )
 
     args = parser.parse_args()
     run_pplm_example(**vars(args))

From 0d6c17fc1b272bca65eef4011365d4d81ec0a8d9 Mon Sep 17 00:00:00 2001
From: Morgan Funtowicz <morgan@huggingface.co>
Date: Mon, 13 Jan 2020 11:18:27 +0100
Subject: [PATCH 09/17] black formatting

---
 src/transformers/pipelines.py | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 45c137b343..71f851b2e1 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -705,14 +705,17 @@ class QuestionAnsweringPipeline(Pipeline):
 
         # Convert inputs to features
         examples = self._args_parser(*texts, **kwargs)
-        features_list = [ squad_convert_examples_to_features(
-                            [example], 
-                            self.tokenizer, 
-                            kwargs["max_seq_len"], 
-                            kwargs["doc_stride"], 
-                            kwargs["max_question_len"], 
-                            False
-                            ) for example in examples ]
+        features_list = [
+            squad_convert_examples_to_features(
+                [example],
+                self.tokenizer,
+                kwargs["max_seq_len"],
+                kwargs["doc_stride"],
+                kwargs["max_question_len"],
+                False,
+            )
+            for example in examples
+        ]
         all_answers = []
         for features, example in zip(features_list, examples):
             fw_args = self.inputs_for_model([f.__dict__ for f in features])
@@ -737,7 +740,10 @@ class QuestionAnsweringPipeline(Pipeline):
                 end_ = np.exp(end_) / np.sum(np.exp(end_))
 
                 # Mask padding and question
-                start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1)
+                start_, end_ = (
+                    start_ * np.abs(np.array(feature.p_mask) - 1),
+                    end_ * np.abs(np.array(feature.p_mask) - 1),
+                )
 
                 # TODO : What happens if not possible
                 # Mask CLS
@@ -758,11 +764,11 @@ class QuestionAnsweringPipeline(Pipeline):
                     }
                     for s, e, score in zip(starts, ends, scores)
                 ]
-            answers = sorted(answers, key = lambda x:x['score'], reverse=True)[:kwargs["topk"]]    
-            all_answers+=answers
-            
+            answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: kwargs["topk"]]
+            all_answers += answers
+
         if len(all_answers) == 1:
-           return all_answers[0]
+            return all_answers[0]
         return all_answers
 
     def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:

From 632682726f64f83014e4259dc42195da6d817695 Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
Date: Sun, 12 Jan 2020 21:53:19 +0100
Subject: [PATCH 10/17] Updated Configurations

---
 docs/source/model_doc/albert.rst             |   2 +-
 src/transformers/configuration_albert.py     |  97 +++++++++++------
 src/transformers/configuration_auto.py       | 103 ++++++++----------
 src/transformers/configuration_bert.py       |  61 +++++++----
 src/transformers/configuration_camembert.py  |  13 +++
 src/transformers/configuration_ctrl.py       |  74 ++++++-------
 src/transformers/configuration_distilbert.py |  46 +++++++-
 src/transformers/configuration_gpt2.py       |  72 ++++++------
 src/transformers/configuration_mmbt.py       |  10 +-
 src/transformers/configuration_openai.py     |  61 +++++++----
 src/transformers/configuration_roberta.py    |  13 +++
 src/transformers/configuration_transfo_xl.py | 100 +++++++++++------
 src/transformers/configuration_xlm.py        | 109 +++++++++++++------
 src/transformers/configuration_xlnet.py      |  86 +++++++++------
 14 files changed, 528 insertions(+), 319 deletions(-)

diff --git a/docs/source/model_doc/albert.rst b/docs/source/model_doc/albert.rst
index 92970c9328..5cf3f5ee7b 100644
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -1,7 +1,7 @@
 ALBERT
 ----------------------------------------------------
 
-``AlbrtConfig``
+``AlbertConfig``
 ~~~~~~~~~~~~~~~~~~~~~
 
 .. autoclass:: transformers.AlbertConfig
diff --git a/src/transformers/configuration_albert.py b/src/transformers/configuration_albert.py
index 1d6adfa7e9..bcf6f7f361 100644
--- a/src/transformers/configuration_albert.py
+++ b/src/transformers/configuration_albert.py
@@ -31,9 +31,73 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 
 class AlbertConfig(PretrainedConfig):
-    """Configuration for `AlbertModel`.
+    r"""
+        This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`.
+        It is used to instantiate an ALBERT model according to the specified arguments, defining the model
+        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+        the ALBERT xxlarge architecture.
 
-    The default settings match the configuration of model `albert_xxlarge`.
+        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+        for more information.
+
+
+        Args:
+            vocab_size (:obj:`int`, optional, defaults to 30000):
+                Vocabulary size of the ALBERT model. Defines the different tokens that
+                can be represented by the `input_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
+            embedding_size (:obj:`int`, optional, defaults to 128):
+                Size of vocabulary embeddings.
+            hidden_size (:obj:`int`, optional, defaults to 4096):
+                Size of the encoder layers and the pooler layer.
+            num_hidden_layers (:obj:`int`, optional, defaults to 12):
+                Number of hidden layers in the Transformer encoder.
+            num_hidden_groups (:obj:`int`, optional, defaults to 1):
+                Number of groups for the hidden layers, parameters in the same group are shared.
+            num_attention_heads (:obj:`int`, optional, defaults to 64):
+                Number of attention heads for each attention layer in the Transformer encoder.
+            intermediate_size (:obj:`int`, optional, defaults to 16384):
+                The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+            inner_group_num (:obj:`int`, optional, defaults to 1):
+                The number of inner repetitions of attention and ffn.
+            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
+                The non-linear activation function (function or string) in the encoder and pooler.
+                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+            hidden_dropout_prob (:obj:`float`, optional, defaults to 0):
+                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0):
+                The dropout ratio for the attention probabilities.
+            max_position_embeddings (:obj:`int`, optional, defaults to 512):
+                The maximum sequence length that this model might ever be used with. Typically set this to something
+                large (e.g., 512 or 1024 or 2048).
+            type_vocab_size (:obj:`int`, optional, defaults to 2):
+                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.AlbertModel`.
+            initializer_range (:obj:`float`, optional, defaults to 0.02):
+                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+                The epsilon used by the layer normalization layers.
+
+        Example::
+
+            # Initializing an ALBERT-xxlarge style configuration
+            albert_xxlarge_configuration = AlbertConfig()
+
+            # Initializing an ALBERT-base style configuration
+            albert_base_configuration = AlbertConfig(
+                hidden_size=768,
+                num_attention_heads=12,
+                intermediate_size=3072,
+            )
+
+            # Initializing a model from the ALBERT-base style configuration
+            model = AlbertModel(albert_base_configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
     """
 
     pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -57,35 +121,6 @@ class AlbertConfig(PretrainedConfig):
         layer_norm_eps=1e-12,
         **kwargs
     ):
-        """Constructs AlbertConfig.
-
-        Args:
-            vocab_size: Vocabulary size of `inputs_ids` in `AlbertModel`.
-            embedding_size: size of voc embeddings.
-            hidden_size: Size of the encoder layers and the pooler layer.
-            num_hidden_layers: Number of hidden layers in the Transformer encoder.
-            num_hidden_groups: Number of group for the hidden layers, parameters in
-                the same group are shared.
-            num_attention_heads: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
-                layer in the Transformer encoder.
-            inner_group_num: int, number of inner repetition of attention and ffn.
-            down_scale_factor: float, the scale to apply
-            hidden_act: The non-linear activation function (function or string) in the
-                encoder and pooler.
-            hidden_dropout_prob: The dropout probability for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob: The dropout ratio for the attention
-                probabilities.
-            max_position_embeddings: The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048).
-            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
-                `AlbertModel`.
-            initializer_range: The stdev of the truncated_normal_initializer for
-                initializing all weight matrices.
-        """
         super(AlbertConfig, self).__init__(**kwargs)
 
         self.vocab_size = vocab_size
diff --git a/src/transformers/configuration_auto.py b/src/transformers/configuration_auto.py
index 32a0385eca..8ba9515435 100644
--- a/src/transformers/configuration_auto.py
+++ b/src/transformers/configuration_auto.py
@@ -57,29 +57,13 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
 
 
 class AutoConfig(object):
-    r""":class:`~transformers.AutoConfig` is a generic configuration class
+    r"""
+        :class:`~transformers.AutoConfig` is a generic configuration class
         that will be instantiated as one of the configuration classes of the library
-        when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+        when created with the :func:`~transformers.AutoConfig.from_pretrained` class method.
 
-        The `from_pretrained()` method take care of returning the correct model class instance
-        using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The base model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `distilbert`: DistilBertConfig (DistilBERT model)
-            - contains `albert`: AlbertConfig (ALBERT model)
-            - contains `camembert`: CamembertConfig (CamemBERT model)
-            - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
-            - contains `roberta`: RobertaConfig (RoBERTa model)
-            - contains `bert`: BertConfig (Bert model)
-            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
-            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
-            - contains `xlnet`: XLNetConfig (XLNet model)
-            - contains `xlm`: XLMConfig (XLM model)
-            - contains `ctrl` : CTRLConfig (CTRL model)
-        This class cannot be instantiated using `__init__()` (throw an error).
+        The :func:`~transformers.AutoConfig.from_pretrained` method takes care of returning the correct configuration class instance
+        using pattern matching on the `pretrained_model_name_or_path` string argument.
     """
 
     def __init__(self):
@@ -94,6 +78,8 @@ class AutoConfig(object):
             return DistilBertConfig(*args, **kwargs)
         elif "roberta" in model_type:
             return RobertaConfig(*args, **kwargs)
+        elif "albert" in model_type:
+            return AlbertConfig(*args, **kwargs)
         elif "bert" in model_type:
             return BertConfig(*args, **kwargs)
         elif "openai-gpt" in model_type:
@@ -108,8 +94,6 @@ class AutoConfig(object):
             return XLMConfig(*args, **kwargs)
         elif "ctrl" in model_type:
             return CTRLConfig(*args, **kwargs)
-        elif "albert" in model_type:
-            return AlbertConfig(*args, **kwargs)
         elif "camembert" in model_type:
             return CamembertConfig(*args, **kwargs)
         raise ValueError(
@@ -120,59 +104,60 @@ class AutoConfig(object):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        r""" Instantiate a one of the configuration classes of the library
+        r""" Instantiates one of the configuration classes of the library
         from a pre-trained model configuration.
 
         The configuration class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `t5`: T5Config (T5 model)
-            - contains `distilbert`: DistilBertConfig (DistilBERT model)
-            - contains `albert`: AlbertConfig (ALBERT model)
-            - contains `camembert`: CamembertConfig (CamemBERT model)
-            - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
-            - contains `roberta`: RobertaConfig (RoBERTa model)
-            - contains `bert`: BertConfig (Bert model)
-            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
-            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
-            - contains `xlnet`: XLNetConfig (XLNet model)
-            - contains `xlm`: XLMConfig (XLM model)
-            - contains `ctrl` : CTRLConfig (CTRL model)
-        Params:
-            pretrained_model_name_or_path: either:
+            - contains `t5`: :class:`~transformers.T5Config` (T5 model)
+            - contains `distilbert`: :class:`~transformers.DistilBertConfig` (DistilBERT model)
+            - contains `albert`: :class:`~transformers.AlbertConfig` (ALBERT model)
+            - contains `camembert`: :class:`~transformers.CamembertConfig` (CamemBERT model)
+            - contains `xlm-roberta`: :class:`~transformers.XLMRobertaConfig` (XLM-RoBERTa model)
+            - contains `roberta`: :class:`~transformers.RobertaConfig` (RoBERTa model)
+            - contains `bert`: :class:`~transformers.BertConfig` (Bert model)
+            - contains `openai-gpt`: :class:`~transformers.OpenAIGPTConfig` (OpenAI GPT model)
+            - contains `gpt2`: :class:`~transformers.GPT2Config` (OpenAI GPT-2 model)
+            - contains `transfo-xl`: :class:`~transformers.TransfoXLConfig` (Transformer-XL model)
+            - contains `xlnet`: :class:`~transformers.XLNetConfig` (XLNet model)
+            - contains `xlm`: :class:`~transformers.XLMConfig` (XLM model)
+            - contains `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model)
 
-                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
-                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
 
-            cache_dir: (`optional`) string:
+        Args:
+            pretrained_model_name_or_path (:obj:`string`):
+                Is either: \
+                    - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                    - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
+                    - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                    - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
+
+            cache_dir (:obj:`string`, optional, defaults to `None`):
                 Path to a directory in which a downloaded pre-trained model
                 configuration should be cached if the standard cache should not be used.
 
-            kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
+            force_download (:obj:`boolean`, optional, defaults to `False`):
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
 
-                - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
-                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
+            resume_download (:obj:`boolean`, optional, defaults to `False`):
+                Do not delete incompletely received file. Attempt to resume the download if such a file exists.
 
-            force_download: (`optional`) boolean, default False:
-                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
-
-            resume_download: (`optional`) boolean, default False:
-                Do not delete incompletely recieved file. Attempt to resume the download if such a file exists.
-
-            proxies: (`optional`) dict, default None:
-                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
-                The proxies are used on each request.
-
-            return_unused_kwargs: (`optional`) bool:
+            proxies (:obj:`Dict[str, str]`, optional, defaults to `None`):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`.
+                The proxies are used on each request. See `the requests documentation <https://requests.readthedocs.io/en/master/user/advanced/#proxies>`__ for usage.
 
+            return_unused_kwargs (:obj:`boolean`, optional, defaults to `False`):
                 - If False, then this function returns just the final configuration object.
                 - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
 
+            kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): key/value pairs with which to update the configuration object after loading.
+                - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
+                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
+
+
         Examples::
 
-            config = AutoConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            config = AutoConfig.from_pretrained('bert-base-uncased')  # Download configuration from S3 and cache.
             config = AutoConfig.from_pretrained('./test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
             config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
             config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
diff --git a/src/transformers/configuration_bert.py b/src/transformers/configuration_bert.py
index 32fa50a504..867ba61397 100644
--- a/src/transformers/configuration_bert.py
+++ b/src/transformers/configuration_bert.py
@@ -50,32 +50,44 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class BertConfig(PretrainedConfig):
     r"""
-        :class:`~transformers.BertConfig` is the configuration class to store the configuration of a
-        `BertModel`.
+        This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
+        It is used to instantiate a BERT model according to the specified arguments, defining the model
+        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+        the BERT bert-base-uncased architecture.
+
+        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+        for more information.
 
 
-        Arguments:
-            vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
-            hidden_size: Size of the encoder layers and the pooler layer.
-            num_hidden_layers: Number of hidden layers in the Transformer encoder.
-            num_attention_heads: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
-                layer in the Transformer encoder.
-            hidden_act: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
-            hidden_dropout_prob: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attention_probs_dropout_prob: The dropout ratio for the attention
-                probabilities.
-            max_position_embeddings: The maximum sequence length that this model might
-                ever be used with. Typically set this to something large just in case
-                (e.g., 512 or 1024 or 2048).
-            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
-                `BertModel`.
-            initializer_range: The sttdev of the truncated_normal_initializer for
-                initializing all weight matrices.
-            layer_norm_eps: The epsilon used by LayerNorm.
+        Args:
+            vocab_size (:obj:`int`, optional, defaults to 30522):
+                Vocabulary size of the BERT model. Defines the different tokens that
+                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
+            hidden_size (:obj:`int`, optional, defaults to 768):
+                Size of the encoder layers and the pooler layer.
+            num_hidden_layers (:obj:`int`, optional, defaults to 12):
+                Number of hidden layers in the Transformer encoder.
+            num_attention_heads (:obj:`int`, optional, defaults to 12):
+                Number of attention heads for each attention layer in the Transformer encoder.
+            intermediate_size (:obj:`int`, optional, defaults to 3072):
+                The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+                The non-linear activation function (function or string) in the encoder and pooler.
+                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+                The dropout ratio for the attention probabilities.
+            max_position_embeddings (:obj:`int`, optional, defaults to 512):
+                The maximum sequence length that this model might ever be used with.
+                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+            type_vocab_size (:obj:`int`, optional, defaults to 2):
+                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
+            initializer_range (:obj:`float`, optional, defaults to 0.02):
+                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+                The epsilon used by the layer normalization layers.
     """
     pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
@@ -96,6 +108,7 @@ class BertConfig(PretrainedConfig):
         **kwargs
     ):
         super(BertConfig, self).__init__(**kwargs)
+
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
diff --git a/src/transformers/configuration_camembert.py b/src/transformers/configuration_camembert.py
index 8ecdf714b1..618169b8fc 100644
--- a/src/transformers/configuration_camembert.py
+++ b/src/transformers/configuration_camembert.py
@@ -29,4 +29,17 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 
 class CamembertConfig(RobertaConfig):
+    r"""
+        This is the configuration class to store the configuration of a :class:`~transformers.CamembertModel`.
+        It is used to instantiate a Camembert model according to the specified arguments, defining the model
+        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+        the BERT bert-base-uncased architecture.
+
+        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+        for more information.
+
+        The :class:`~transformers.CamembertConfig` class directly inherits :class:`~transformers.RobertaConfig`.
+        It reuses the same defaults. Please check the parent class for more information.
+    """
     pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/src/transformers/configuration_ctrl.py b/src/transformers/configuration_ctrl.py
index e23bf7a376..46113ac293 100644
--- a/src/transformers/configuration_ctrl.py
+++ b/src/transformers/configuration_ctrl.py
@@ -26,25 +26,43 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf
 
 
 class CTRLConfig(PretrainedConfig):
-    """Configuration class to store the configuration of a `CTRLModel`.
+    """
+        This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel`.
+        It is used to instantiate a CTRL model according to the specified arguments, defining the model
+        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+        the CTRL architecture from SalesForce.
 
-    Args:
-        vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
-        n_positions: Number of positional embeddings.
-        n_ctx: Size of the causal mask (usually same as n_positions).
-        dff: Size of the inner dimension of the FFN.
-        n_embd: Dimensionality of the embeddings and hidden states.
-        n_layer: Number of hidden layers in the Transformer encoder.
-        n_head: Number of attention heads for each attention layer in
-            the Transformer encoder.
-        layer_norm_epsilon: epsilon to use in the layer norm layers
-        resid_pdrop: The dropout probabilitiy for all fully connected
-            layers in the embeddings, encoder, and pooler.
-        attn_pdrop: The dropout ratio for the attention
-            probabilities.
-        embd_pdrop: The dropout ratio for the embeddings.
-        initializer_range: The sttdev of the truncated_normal_initializer for
-            initializing all weight matrices.
+        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+        for more information.
+
+        Args:
+            vocab_size (:obj:`int`, optional, defaults to 246534):
+                Vocabulary size of the CTRL model. Defines the different tokens that
+                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.CTRLModel`.
+            n_positions (:obj:`int`, optional, defaults to 256):
+                The maximum sequence length that this model might ever be used with.
+                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+            n_ctx (:obj:`int`, optional, defaults to 256):
+                Size of the causal mask (usually same as n_positions).
+            n_embd (:obj:`int`, optional, defaults to 1280):
+                Dimensionality of the embeddings and hidden states.
+            dff (:obj:`int`, optional, defaults to 8192):
+                Size of the inner dimension of the FFN.
+            n_layer (:obj:`int`, optional, defaults to 48):
+                Number of hidden layers in the Transformer encoder.
+            n_head (:obj:`int`, optional, defaults to 16):
+                Number of attention heads for each attention layer in the Transformer encoder.
+            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
+                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+            embd_pdrop (:obj:`float`, optional, defaults to 0.1):
+                The dropout ratio for the embeddings.
+            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
+                The dropout ratio for the attention.
+            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-6):
+                The epsilon to use in the layer normalization layers.
+            initializer_range (:obj:`float`, optional, defaults to 0.02):
+                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
     """
 
     pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -70,26 +88,6 @@ class CTRLConfig(PretrainedConfig):
         summary_first_dropout=0.1,
         **kwargs
     ):
-        """Constructs CTRLConfig.
-
-        Args:
-            vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
-            n_positions: Number of positional embeddings.
-            n_ctx: Size of the causal mask (usually same as n_positions).
-            dff: Size of the inner dimension of the FFN.
-            n_embd: Dimensionality of the embeddings and hidden states.
-            n_layer: Number of hidden layers in the Transformer encoder.
-            n_head: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            layer_norm_epsilon: epsilon to use in the layer norm layers
-            resid_pdrop: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attn_pdrop: The dropout ratio for the attention
-                probabilities.
-            embd_pdrop: The dropout ratio for the embeddings.
-            initializer_range: The sttdev of the truncated_normal_initializer for
-                initializing all weight matrices.
-        """
         super(CTRLConfig, self).__init__(**kwargs)
         self.vocab_size = vocab_size
         self.n_ctx = n_ctx
diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py
index 1dd4a11912..df0b73ed9c 100644
--- a/src/transformers/configuration_distilbert.py
+++ b/src/transformers/configuration_distilbert.py
@@ -31,6 +31,50 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 
 class DistilBertConfig(PretrainedConfig):
+    r"""
+        This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
+        It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
+        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+        the DistilBERT distilbert-base-uncased architecture.
+
+        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+        for more information.
+
+
+        Args:
+            vocab_size (:obj:`int`, optional, defaults to 30522):
+                Vocabulary size of the DistilBERT model. Defines the different tokens that
+                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.DistilBertModel`.
+            max_position_embeddings (:obj:`int`, optional, defaults to 512):
+                The maximum sequence length that this model might ever be used with.
+                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+            sinusoidal_pos_embds (:obj:`boolean`, optional, defaults to :obj:`False`):
+                Whether to use sinusoidal positional embeddings.
+            n_layers (:obj:`int`, optional, defaults to 6):
+                Number of hidden layers in the Transformer encoder.
+            n_heads (:obj:`int`, optional, defaults to 12):
+                Number of attention heads for each attention layer in the Transformer encoder.
+            dim (:obj:`int`, optional, defaults to 768):
+                Size of the encoder layers and the pooler layer.
+            intermediate_size (:obj:`int`, optional, defaults to 3072):
+                The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+            dropout (:obj:`float`, optional, defaults to 0.1):
+                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+            attention_dropout (:obj:`float`, optional, defaults to 0.1):
+                The dropout ratio for the attention probabilities.
+            activation (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+                The non-linear activation function (function or string) in the encoder and pooler.
+                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+            initializer_range (:obj:`float`, optional, defaults to 0.02):
+                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+            qa_dropout (:obj:`float`, optional, defaults to 0.1):
+                The dropout probabilities used in the question answering model
+                :class:`~transformers.DistilBertForQuestionAnswering`.
+            seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
+                The dropout probabilities used in the sequence classification model
+                :class:`~transformers.DistilBertForSequenceClassification`.
+    """
     pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(
@@ -46,7 +90,6 @@ class DistilBertConfig(PretrainedConfig):
         attention_dropout=0.1,
         activation="gelu",
         initializer_range=0.02,
-        tie_weights_=True,
         qa_dropout=0.1,
         seq_classif_dropout=0.2,
         **kwargs
@@ -63,7 +106,6 @@ class DistilBertConfig(PretrainedConfig):
         self.attention_dropout = attention_dropout
         self.activation = activation
         self.initializer_range = initializer_range
-        self.tie_weights_ = tie_weights_
         self.qa_dropout = qa_dropout
         self.seq_classif_dropout = seq_classif_dropout
 
diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py
index 8da1800747..e421a7b10f 100644
--- a/src/transformers/configuration_gpt2.py
+++ b/src/transformers/configuration_gpt2.py
@@ -33,24 +33,42 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 
 class GPT2Config(PretrainedConfig):
-    """Configuration class to store the configuration of a `GPT2Model`.
+    """
+        This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
+        It is used to instantiate a GPT-2 model according to the specified arguments, defining the model
+        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+        the GPT-2 small architecture.
 
-    Args:
-        vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
-        n_positions: Number of positional embeddings.
-        n_ctx: Size of the causal mask (usually same as n_positions).
-        n_embd: Dimensionality of the embeddings and hidden states.
-        n_layer: Number of hidden layers in the Transformer encoder.
-        n_head: Number of attention heads for each attention layer in
-            the Transformer encoder.
-        layer_norm_epsilon: epsilon to use in the layer norm layers
-        resid_pdrop: The dropout probabilitiy for all fully connected
-            layers in the embeddings, encoder, and pooler.
-        attn_pdrop: The dropout ratio for the attention
-            probabilities.
-        embd_pdrop: The dropout ratio for the embeddings.
-        initializer_range: The sttdev of the truncated_normal_initializer for
-            initializing all weight matrices.
+        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+        for more information.
+
+
+        Args:
+            vocab_size (:obj:`int`, optional, defaults to 50257):
+                Vocabulary size of the GPT-2 model. Defines the different tokens that
+                can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.GPT2Model`.
+            n_positions (:obj:`int`, optional, defaults to 1024):
+                The maximum sequence length that this model might ever be used with.
+                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+            n_ctx (:obj:`int`, optional, defaults to 1024):
+                Size of the causal mask (usually same as n_positions).
+            n_embd (:obj:`int`, optional, defaults to 768):
+                Dimensionality of the embeddings and hidden states.
+            n_layer (:obj:`int`, optional, defaults to 12):
+                Number of hidden layers in the Transformer encoder.
+            n_head (:obj:`int`, optional, defaults to 12):
+                Number of attention heads for each attention layer in the Transformer encoder.
+            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
+                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+            embd_pdrop (:obj:`float`, optional, defaults to 0.1):
+                The dropout ratio for the embeddings.
+            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
+                The dropout ratio for the attention.
+            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
+                The epsilon to use in the layer normalization layers.
+            initializer_range (:obj:`float`, optional, defaults to 0.02):
+                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
     """
 
     pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -75,26 +93,8 @@ class GPT2Config(PretrainedConfig):
         summary_first_dropout=0.1,
         **kwargs
     ):
-        """Constructs GPT2Config.
-
-        Args:
-            vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
-            n_positions: Number of positional embeddings.
-            n_ctx: Size of the causal mask (usually same as n_positions).
-            n_embd: Dimensionality of the embeddings and hidden states.
-            n_layer: Number of hidden layers in the Transformer encoder.
-            n_head: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            layer_norm_epsilon: epsilon to use in the layer norm layers
-            resid_pdrop: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attn_pdrop: The dropout ratio for the attention
-                probabilities.
-            embd_pdrop: The dropout ratio for the embeddings.
-            initializer_range: The sttdev of the truncated_normal_initializer for
-                initializing all weight matrices.
-        """
         super(GPT2Config, self).__init__(**kwargs)
+
         self.vocab_size = vocab_size
         self.n_ctx = n_ctx
         self.n_positions = n_positions
diff --git a/src/transformers/configuration_mmbt.py b/src/transformers/configuration_mmbt.py
index b072468e7f..56a35e237c 100644
--- a/src/transformers/configuration_mmbt.py
+++ b/src/transformers/configuration_mmbt.py
@@ -26,9 +26,13 @@ class MMBTConfig(object):
     """Configuration class to store the configuration of a `MMBT Model`.
 
     Args:
-        config: config of the underlying Transformer models. It's values are copied over to use a single config.
-        num_labels: Size of final Linear layer for classification.
-        modal_hidden_size: Embedding dimension of the non-text modality encoder.
+        config (:obj:`~transformers.PretrainedConfig`):
+            Config of the underlying Transformer models. Its values are
+            copied over to use a single config.
+        num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`):
+            Size of final Linear layer for classification.
+        modal_hidden_size (:obj:`int`, optional, defaults to 2048):
+            Embedding dimension of the non-text modality encoder.
     """
 
     def __init__(self, config, num_labels=None, modal_hidden_size=2048):
diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py
index d7e88bda92..28c501b77e 100644
--- a/src/transformers/configuration_openai.py
+++ b/src/transformers/configuration_openai.py
@@ -30,27 +30,45 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class OpenAIGPTConfig(PretrainedConfig):
     """
-    Configuration class to store the configuration of a `OpenAIGPTModel`.
+        This is the configuration class to store the configuration of an :class:`~transformers.OpenAIGPTModel`.
+        It is used to instantiate a GPT model according to the specified arguments, defining the model
+        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+        the GPT architecture from OpenAI.
 
-    Args:
-        vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
-        n_positions: Number of positional embeddings.
-        n_ctx: Size of the causal mask (usually same as n_positions).
-        n_embd: Dimensionality of the embeddings and hidden states.
-        n_layer: Number of hidden layers in the Transformer encoder.
-        n_head: Number of attention heads for each attention layer in
-            the Transformer encoder.
-        afn: The non-linear activation function (function or string) in the
-            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
-        resid_pdrop: The dropout probabilitiy for all fully connected
-            layers in the embeddings, encoder, and pooler.
-        attn_pdrop: The dropout ratio for the attention
-            probabilities.
-        embd_pdrop: The dropout ratio for the embeddings.
-        layer_norm_epsilon: epsilon to use in the layer norm layers
-        initializer_range: The sttdev of the truncated_normal_initializer for
-            initializing all weight matrices.
-        predict_special_tokens: should we predict special tokens (when the model has a LM head)
+        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+        for more information.
+
+        Args:
+            vocab_size (:obj:`int`, optional, defaults to 40478):
+                Vocabulary size of the GPT model. Defines the different tokens that
+                can be represented by the `input_ids` passed to the forward method of :class:`~transformers.OpenAIGPTModel`.
+            n_positions (:obj:`int`, optional, defaults to 512):
+                The maximum sequence length that this model might ever be used with.
+                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+            n_ctx (:obj:`int`, optional, defaults to 512):
+                Size of the causal mask (usually same as n_positions).
+            n_embd (:obj:`int`, optional, defaults to 768):
+                Dimensionality of the embeddings and hidden states.
+            n_layer (:obj:`int`, optional, defaults to 12):
+                Number of hidden layers in the Transformer encoder.
+            n_head (:obj:`int`, optional, defaults to 12):
+                Number of attention heads for each attention layer in the Transformer encoder.
+            afn (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+                The non-linear activation function (function or string) in the encoder and pooler.
+                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+            resid_pdrop (:obj:`float`, optional, defaults to 0.1):
+                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+            embd_pdrop (:obj:`float`, optional, defaults to 0.1):
+                The dropout ratio for the embeddings.
+            attn_pdrop (:obj:`float`, optional, defaults to 0.1):
+                The dropout ratio for the attention.
+            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
+                The epsilon to use in the layer normalization layers
+            initializer_range (:obj:`float`, optional, defaults to 0.02):
+                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+            predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Whether special tokens should be predicted when the model has a language modeling head.
     """
 
     pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -77,9 +95,8 @@ class OpenAIGPTConfig(PretrainedConfig):
         summary_first_dropout=0.1,
         **kwargs
     ):
-        """Constructs OpenAIGPTConfig.
-        """
         super(OpenAIGPTConfig, self).__init__(**kwargs)
+
         self.vocab_size = vocab_size
         self.n_ctx = n_ctx
         self.n_positions = n_positions
diff --git a/src/transformers/configuration_roberta.py b/src/transformers/configuration_roberta.py
index f505a699b1..5dc9776942 100644
--- a/src/transformers/configuration_roberta.py
+++ b/src/transformers/configuration_roberta.py
@@ -34,4 +34,17 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 
 class RobertaConfig(BertConfig):
+    r"""
+        This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel`.
+        It is used to instantiate a RoBERTa model according to the specified arguments, defining the model
+        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+        the BERT bert-base-uncased architecture.
+
+        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+        for more information.
+
+        The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
+        It reuses the same defaults. Please check the parent class for more information.
+    """
     pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/src/transformers/configuration_transfo_xl.py b/src/transformers/configuration_transfo_xl.py
index 7b285ca3ed..789f6c03a4 100644
--- a/src/transformers/configuration_transfo_xl.py
+++ b/src/transformers/configuration_transfo_xl.py
@@ -29,39 +29,74 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 
 class TransfoXLConfig(PretrainedConfig):
-    """Configuration class to store the configuration of a `TransfoXLModel`.
+    """
+        This is the configuration class to store the configuration of an :class:`~transformers.TransfoXLModel`.
+        It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
+        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+        the Transformer XL architecture.
+
+        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+        for more information.
 
         Args:
-            vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
-            cutoffs: cutoffs for the adaptive softmax
-            d_model: Dimensionality of the model's hidden states.
-            d_embed: Dimensionality of the embeddings
-            d_head: Dimensionality of the model's heads.
-            div_val: divident value for adapative input and softmax
-            pre_lnorm: apply LayerNorm to the input instead of the output
-            d_inner: Inner dimension in FF
-            n_layer: Number of hidden layers in the Transformer encoder.
-            n_head: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            tgt_len: number of tokens to predict
-            ext_len: length of the extended context
-            mem_len: length of the retained previous heads
-            same_length: use the same attn length for all tokens
-            proj_share_all_but_first: True to share all but first projs, False not to share.
-            attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
-            clamp_len: use the same pos embeddings after clamp_len
-            sample_softmax: number of samples in sampled softmax
-            adaptive: use adaptive softmax
-            tie_weight: tie the word embedding and softmax weights
-            dropout: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            dropatt: The dropout ratio for the attention probabilities.
-            untie_r: untie relative position biases
-            embd_pdrop: The dropout ratio for the embeddings.
-            init: parameter initializer to use
-            init_range: parameters initialized by U(-init_range, init_range).
-            proj_init_std: parameters initialized by N(0, init_std)
-            init_std: parameters initialized by N(0, init_std)
+            vocab_size (:obj:`int`, optional, defaults to 267735):
+                Vocabulary size of the Transformer XL model. Defines the different tokens that
+                can be represented by the `input_ids` passed to the forward method of :class:`~transformers.TransfoXLModel`.
+            cutoffs (:obj:`List[int]`, optional, defaults to :obj:`[20000, 40000, 200000]`):
+                Cutoffs for the adaptive softmax
+            d_model (:obj:`int`, optional, defaults to 1024):
+                Dimensionality of the model's hidden states.
+            d_embed (:obj:`int`, optional, defaults to 1024):
+                Dimensionality of the embeddings
+            n_head (:obj:`int`, optional, defaults to 16):
+                Number of attention heads for each attention layer in the Transformer encoder.
+            d_head (:obj:`int`, optional, defaults to 64):
+                Dimensionality of the model's heads.
+            d_inner (:obj:`int`, optional, defaults to 4096):
+                Inner dimension in FF
+            div_val (:obj:`int`, optional, defaults to 4):
+                Divisor value for adaptive input and softmax
+            pre_lnorm (:obj:`boolean`, optional, defaults to :obj:`False`):
+                Apply LayerNorm to the input instead of the output
+            n_layer (:obj:`int`, optional, defaults to 18):
+                Number of hidden layers in the Transformer encoder.
+            tgt_len (:obj:`int`, optional, defaults to 128):
+                Number of tokens to predict
+            ext_len (:obj:`int`, optional, defaults to 0):
+                Length of the extended context
+            mem_len (:obj:`int`, optional, defaults to 1600):
+                Length of the retained previous heads
+            clamp_len (:obj:`int`, optional, defaults to 1000):
+                use the same pos embeddings after clamp_len
+            same_length (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Use the same attn length for all tokens
+            proj_share_all_but_first (:obj:`boolean`, optional, defaults to :obj:`True`):
+                True to share all but first projs, False not to share.
+            attn_type (:obj:`int`, optional, defaults to 0):
+                Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
+            sample_softmax (:obj:`int`, optional, defaults to -1):
+                number of samples in sampled softmax
+            adaptive (:obj:`boolean`, optional, defaults to :obj:`True`):
+                use adaptive softmax
+            tie_weight (:obj:`boolean`, optional, defaults to :obj:`True`):
+                tie the word embedding and softmax weights
+            dropout (:obj:`float`, optional, defaults to 0.1):
+                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+            dropatt (:obj:`float`, optional, defaults to 0):
+                The dropout ratio for the attention probabilities.
+            untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Untie relative position biases
+            init (:obj:`string`, optional, defaults to `normal`):
+                Parameter initializer to use
+            init_range (:obj:`float`, optional, defaults to 0.01):
+                Parameters initialized by U(-init_range, init_range).
+            proj_init_std (:obj:`float`, optional, defaults to 0.01):
+                Parameters initialized by N(0, init_std)
+            init_std (:obj:`float`, optional, defaults to 0.02):
+                Parameters initialized by N(0, init_std)
+            layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
+                The epsilon to use in the layer normalization layers
     """
 
     pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -98,9 +133,8 @@ class TransfoXLConfig(PretrainedConfig):
         layer_norm_epsilon=1e-5,
         **kwargs
     ):
-        """Constructs TransfoXLConfig.
-        """
         super(TransfoXLConfig, self).__init__(**kwargs)
+
         self.vocab_size = vocab_size
         self.cutoffs = []
         self.cutoffs.extend(cutoffs)
diff --git a/src/transformers/configuration_xlm.py b/src/transformers/configuration_xlm.py
index b56182413b..0aa449ae7b 100644
--- a/src/transformers/configuration_xlm.py
+++ b/src/transformers/configuration_xlm.py
@@ -37,44 +37,81 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 
 class XLMConfig(PretrainedConfig):
-    """Configuration class to store the configuration of a `XLMModel`.
+    """
+        This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`.
+        It is used to instantiate an XLM model according to the specified arguments, defining the model
+        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+        the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
 
-    Args:
-        vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`.
-        d_model: Size of the encoder layers and the pooler layer.
-        n_layer: Number of hidden layers in the Transformer encoder.
-        n_head: Number of attention heads for each attention layer in
-            the Transformer encoder.
-        d_inner: The size of the "intermediate" (i.e., feed-forward)
-            layer in the Transformer encoder.
-        ff_activation: The non-linear activation function (function or string) in the
-            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
-        untie_r: untie relative position biases
-        attn_type: 'bi' for XLM, 'uni' for Transformer-XL
+        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+        for more information.
 
-        dropout: The dropout probabilitiy for all fully connected
-            layers in the embeddings, encoder, and pooler.
-        max_position_embeddings: The maximum sequence length that this model might
-            ever be used with. Typically set this to something large just in case
-            (e.g., 512 or 1024 or 2048).
-        initializer_range: The sttdev of the truncated_normal_initializer for
-            initializing all weight matrices.
-        layer_norm_eps: The epsilon used by LayerNorm.
-
-        dropout: float, dropout rate.
-        init: str, the initialization scheme, either "normal" or "uniform".
-        init_range: float, initialize the parameters with a uniform distribution
-            in [-init_range, init_range]. Only effective when init="uniform".
-        init_std: float, initialize the parameters with a normal distribution
-            with mean 0 and stddev init_std. Only effective when init="normal".
-        mem_len: int, the number of tokens to cache.
-        reuse_len: int, the number of tokens in the currect batch to be cached
-            and reused in the future.
-        bi_data: bool, whether to use bidirectional input pipeline.
-            Usually set to True during pretraining and False during finetuning.
-        clamp_len: int, clamp all relative distances larger than clamp_len.
-            -1 means no clamping.
-        same_length: bool, whether to use the same attention length for each token.
+        Args:
+            vocab_size (:obj:`int`, optional, defaults to 30145):
+                Vocabulary size of the XLM model. Defines the different tokens that
+                can be represented by the `input_ids` passed to the forward method of :class:`~transformers.XLMModel`.
+            emb_dim (:obj:`int`, optional, defaults to 2048):
+                Dimensionality of the encoder layers and the pooler layer.
+            n_layer (:obj:`int`, optional, defaults to 12):
+                Number of hidden layers in the Transformer encoder.
+            n_head (:obj:`int`, optional, defaults to 16):
+                Number of attention heads for each attention layer in the Transformer encoder.
+            dropout (:obj:`float`, optional, defaults to 0.1):
+                The dropout probability for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attention_dropout (:obj:`float`, optional, defaults to 0.1):
+                The dropout probability for the attention mechanism
+            gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`):
+                The non-linear activation function (function or string) in the
+                encoder and pooler. If set to `True`, "gelu" will be used instead of "relu".
+            sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`):
+                Whether to use sinusoidal positional embeddings instead of absolute positional embeddings.
+            causal (:obj:`boolean`, optional, defaults to :obj:`False`):
+                Set this to `True` for the model to behave in a causal manner.
+                Causal models use a triangular attention mask in order to only attend to the left-side context instead
+                of a bidirectional context.
+            asm (:obj:`boolean`, optional, defaults to :obj:`False`):
+                TODO
+            n_langs (:obj:`int`, optional, defaults to 1):
+                The number of languages the model handles. Set to 1 for monolingual models.
+            use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Whether to use language embeddings. Some models use additional language embeddings, see
+                `the multilingual models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__
+                for information on how to use them.
+            max_position_embeddings (:obj:`int`, optional, defaults to 512):
+                The maximum sequence length that this model might
+                ever be used with. Typically set this to something large just in case
+                (e.g., 512 or 1024 or 2048).
+            embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5):
+                The standard deviation of the truncated_normal_initializer for
+                initializing the embedding matrices.
+            init_std (:obj:`float`, optional, defaults to 0.02):
+                The standard deviation of the truncated_normal_initializer for
+                initializing all weight matrices except the embedding matrices.
+            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+                The epsilon used by the layer normalization layers.
+            bos_index (:obj:`int`, optional, defaults to 0):
+                The index of the beginning of sentence token in the vocabulary.
+            eos_index (:obj:`int`, optional, defaults to 1):
+                The index of the end of sentence token in the vocabulary.
+            pad_index (:obj:`int`, optional, defaults to 2):
+                The index of the padding token in the vocabulary.
+            unk_index (:obj:`int`, optional, defaults to 3):
+                The index of the unknown token in the vocabulary.
+            mask_index (:obj:`int`, optional, defaults to 5):
+                The index of the masking token in the vocabulary.
+            is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
+            start_n_top (:obj:`int`, optional, defaults to 5):
+                TODO
+            end_n_top (:obj:`int`, optional, defaults to 5):
+                TODO
+            mask_token_id (:obj:`int`, optional, defaults to 0):
+                Model agnostic parameter to identify masked tokens when generating text in an MLM context.
+            lang_id (:obj:`int`, optional, defaults to 1):
+                The ID of the language used by the model. This parameter is used when generating
+                text in a given language.
     """
 
     pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/src/transformers/configuration_xlnet.py b/src/transformers/configuration_xlnet.py
index 38d00d7604..15337d3920 100644
--- a/src/transformers/configuration_xlnet.py
+++ b/src/transformers/configuration_xlnet.py
@@ -30,42 +30,60 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 
 class XLNetConfig(PretrainedConfig):
-    """Configuration class to store the configuration of a ``XLNetModel``.
+    """
+        This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel`.
+        It is used to instantiate an XLNet model according to the specified arguments, defining the model
+        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+        the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
 
-    Args:
-        vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
-        d_model: Size of the encoder layers and the pooler layer.
-        n_layer: Number of hidden layers in the Transformer encoder.
-        n_head: Number of attention heads for each attention layer in
-            the Transformer encoder.
-        d_inner: The size of the "intermediate" (i.e., feed-forward)
-            layer in the Transformer encoder.
-        ff_activation: The non-linear activation function (function or string) in the
-            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
-        untie_r: untie relative position biases
-        attn_type: 'bi' for XLNet, 'uni' for Transformer-XL
+        Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+        for more information.
 
-        dropout: The dropout probabilitiy for all fully connected
-            layers in the embeddings, encoder, and pooler.
-        initializer_range: The sttdev of the truncated_normal_initializer for
-            initializing all weight matrices.
-        layer_norm_eps: The epsilon used by LayerNorm.
-
-        dropout: float, dropout rate.
-        init: str, the initialization scheme, either "normal" or "uniform".
-        init_range: float, initialize the parameters with a uniform distribution
-            in [-init_range, init_range]. Only effective when init="uniform".
-        init_std: float, initialize the parameters with a normal distribution
-            with mean 0 and stddev init_std. Only effective when init="normal".
-        mem_len: int, the number of tokens to cache.
-        reuse_len: int, the number of tokens in the currect batch to be cached
-            and reused in the future.
-        bi_data: bool, whether to use bidirectional input pipeline.
-            Usually set to True during pretraining and False during finetuning.
-        clamp_len: int, clamp all relative distances larger than clamp_len.
-            -1 means no clamping.
-        same_length: bool, whether to use the same attention length for each token.
-        finetuning_task: name of the glue task on which the model was fine-tuned if any
+        Args:
+            vocab_size (:obj:`int`, optional, defaults to 32000):
+                Vocabulary size of the XLNet model. Defines the different tokens that
+                can be represented by the `input_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
+            d_model (:obj:`int`, optional, defaults to 1024):
+                Size of the encoder layers and the pooler layer.
+            n_layer (:obj:`int`, optional, defaults to 24):
+                Number of hidden layers in the Transformer encoder.
+            n_head (:obj:`int`, optional, defaults to 16):
+                Number of attention heads for each attention layer in the Transformer encoder.
+            d_inner (:obj:`int`, optional, defaults to 4096):
+                The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+            ff_activation (:obj:`string`, optional, defaults to "gelu"):
+                The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+            untie_r (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Untie relative position biases
+            attn_type (:obj:`string`, optional, defaults to "bi"):
+                The attention type used by the model. Set 'bi' for XLNet, 'uni' for Transformer-XL.
+            initializer_range (:obj:`float`, optional, defaults to 0.02):
+                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+                The epsilon used by the layer normalization layers.
+            dropout (:obj:`float`, optional, defaults to 0.1):
+                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+            mem_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
+                The number of tokens to cache. The key/value pairs that have already been pre-computed
+                in a previous forward pass won't be re-computed. See the
+                `quickstart <https://huggingface.co/transformers/quickstart.html#using-the-past>`__
+                for more information.
+            reuse_len (:obj:`int` or :obj:`None`, optional, defaults to :obj:`None`):
+                The number of tokens in the current batch to be cached and reused in the future.
+            bi_data (:obj:`boolean`, optional, defaults to :obj:`False`):
+                Whether to use bidirectional input pipeline. Usually set to `True` during
+                pretraining and `False` during finetuning.
+            clamp_len (:obj:`int`, optional, defaults to -1):
+                Clamp all relative distances larger than clamp_len.
+                Setting this attribute to -1 means no clamping.
+            same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
+                Whether to use the same attention length for each token.
+            start_n_top (:obj:`int`, optional, defaults to 5):
+                TODO
+            end_n_top (:obj:`int`, optional, defaults to 5):
+                TODO
     """
 
     pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP

From c11b6fd393288255d9958a4484b4a955308227bb Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Mon, 13 Jan 2020 12:43:55 +0100
Subject: [PATCH 11/17] Update links in all configurations

---
 src/transformers/configuration_albert.py     | 2 +-
 src/transformers/configuration_bert.py       | 2 +-
 src/transformers/configuration_camembert.py  | 2 +-
 src/transformers/configuration_ctrl.py       | 2 +-
 src/transformers/configuration_distilbert.py | 2 +-
 src/transformers/configuration_gpt2.py       | 2 +-
 src/transformers/configuration_openai.py     | 2 +-
 src/transformers/configuration_roberta.py    | 2 +-
 src/transformers/configuration_transfo_xl.py | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/transformers/configuration_albert.py b/src/transformers/configuration_albert.py
index bcf6f7f361..c86107e503 100644
--- a/src/transformers/configuration_albert.py
+++ b/src/transformers/configuration_albert.py
@@ -35,7 +35,7 @@ class AlbertConfig(PretrainedConfig):
         This is the configuration class to store the configuration of an :class:`~transformers.AlbertModel`.
         It is used to instantiate an ALBERT model according to the specified arguments, defining the model
         architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the ALBERT xxlarge architecture.
+        the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
 
         Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
         to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
diff --git a/src/transformers/configuration_bert.py b/src/transformers/configuration_bert.py
index 867ba61397..62bc43f46e 100644
--- a/src/transformers/configuration_bert.py
+++ b/src/transformers/configuration_bert.py
@@ -53,7 +53,7 @@ class BertConfig(PretrainedConfig):
         This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
         It is used to instantiate an BERT model according to the specified arguments, defining the model
         architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the BERT bert-base-uncased architecture.
+        the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
 
         Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
         to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
diff --git a/src/transformers/configuration_camembert.py b/src/transformers/configuration_camembert.py
index 618169b8fc..0063e4ada3 100644
--- a/src/transformers/configuration_camembert.py
+++ b/src/transformers/configuration_camembert.py
@@ -33,7 +33,7 @@ class CamembertConfig(RobertaConfig):
         This is the configuration class to store the configuration of an :class:`~transformers.CamembertModel`.
         It is used to instantiate an Camembert model according to the specified arguments, defining the model
         architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the BERT bert-base-uncased architecture.
+        the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
 
         Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
         to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
diff --git a/src/transformers/configuration_ctrl.py b/src/transformers/configuration_ctrl.py
index 46113ac293..446ebffed7 100644
--- a/src/transformers/configuration_ctrl.py
+++ b/src/transformers/configuration_ctrl.py
@@ -30,7 +30,7 @@ class CTRLConfig(PretrainedConfig):
         This is the configuration class to store the configuration of an :class:`~transformers.CTRLModel`.
         It is used to instantiate an CTRL model according to the specified arguments, defining the model
         architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the CTRL architecture from SalesForce.
+        the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
 
         Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
         to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py
index df0b73ed9c..ffad013704 100644
--- a/src/transformers/configuration_distilbert.py
+++ b/src/transformers/configuration_distilbert.py
@@ -35,7 +35,7 @@ class DistilBertConfig(PretrainedConfig):
         This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel`.
         It is used to instantiate a DistilBERT model according to the specified arguments, defining the model
         architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the DistilBERT distilbert-base-uncased architecture.
+        the DistilBERT `distilbert-base-uncased <https://huggingface.co/distilbert-base-uncased>`__ architecture.
 
         Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
         to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py
index e421a7b10f..add006d4d2 100644
--- a/src/transformers/configuration_gpt2.py
+++ b/src/transformers/configuration_gpt2.py
@@ -37,7 +37,7 @@ class GPT2Config(PretrainedConfig):
         This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model`.
         It is used to instantiate an GPT-2 model according to the specified arguments, defining the model
         architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the GPT-2 small architecture.
+        the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
 
         Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
         to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py
index 28c501b77e..3a68b8f5d8 100644
--- a/src/transformers/configuration_openai.py
+++ b/src/transformers/configuration_openai.py
@@ -33,7 +33,7 @@ class OpenAIGPTConfig(PretrainedConfig):
         This is the configuration class to store the configuration of an :class:`~transformers.OpenAIGPTModel`.
         It is used to instantiate an GPT model according to the specified arguments, defining the model
         architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the GPT architecture from OpenAI.
+        the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
 
         Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
         to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
diff --git a/src/transformers/configuration_roberta.py b/src/transformers/configuration_roberta.py
index 5dc9776942..a9c53d42aa 100644
--- a/src/transformers/configuration_roberta.py
+++ b/src/transformers/configuration_roberta.py
@@ -38,7 +38,7 @@ class RobertaConfig(BertConfig):
         This is the configuration class to store the configuration of an :class:`~transformers.RobertaModel`.
         It is used to instantiate an RoBERTa model according to the specified arguments, defining the model
         architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the BERT bert-base-uncased architecture.
+        the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
 
         Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
         to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
diff --git a/src/transformers/configuration_transfo_xl.py b/src/transformers/configuration_transfo_xl.py
index 789f6c03a4..e4052408d2 100644
--- a/src/transformers/configuration_transfo_xl.py
+++ b/src/transformers/configuration_transfo_xl.py
@@ -33,7 +33,7 @@ class TransfoXLConfig(PretrainedConfig):
         This is the configuration class to store the configuration of an :class:`~transformers.TransfoXLModel`.
         It is used to instantiate a Transformer XL model according to the specified arguments, defining the model
         architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-        the Transformer XL architecture.
+        the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
 
         Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
         to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`

From a1cb100460f482bfb9723469709e30d3c5f5b1ac Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Mon, 13 Jan 2020 13:11:37 +0100
Subject: [PATCH 12/17] Wrap up configurations

---
 src/transformers/configuration_gpt2.py   | 25 ++++++++++++++++++++
 src/transformers/configuration_openai.py | 25 ++++++++++++++++++++
 src/transformers/configuration_xlm.py    | 29 ++++++++++++++++++++++--
 src/transformers/configuration_xlnet.py  | 29 ++++++++++++++++++++++--
 4 files changed, 104 insertions(+), 4 deletions(-)

diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py
index add006d4d2..2b9042bd75 100644
--- a/src/transformers/configuration_gpt2.py
+++ b/src/transformers/configuration_gpt2.py
@@ -69,6 +69,31 @@ class GPT2Config(PretrainedConfig):
                 The epsilon to use in the layer normalization layers
             initializer_range (:obj:`float`, optional, defaults to 16):
                 The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+            summary_type (:obj:`string`, optional, defaults to "cls_index"):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.GPT2DoubleHeadsModel`.
+                Is one of the following options:
+                    - 'last' => take the last token hidden state (like XLNet)
+                    - 'first' => take the first token hidden state (like Bert)
+                    - 'mean' => take the mean of all tokens hidden states
+                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+                    - 'attn' => Not implemented now, use multi-head attention
+            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.GPT2DoubleHeadsModel`.
+                Add a projection after the vector extraction
+            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.GPT2DoubleHeadsModel`.
+                'tanh' => add a tanh activation to the output, Other => no activation.
+            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.GPT2DoubleHeadsModel`.
+                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
+            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.GPT2DoubleHeadsModel`.
+                Add a dropout before the projection and activation
     """
 
     pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py
index 3a68b8f5d8..e6fc1efce3 100644
--- a/src/transformers/configuration_openai.py
+++ b/src/transformers/configuration_openai.py
@@ -69,6 +69,31 @@ class OpenAIGPTConfig(PretrainedConfig):
                 The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
             predict_special_tokens (:obj:`boolean`, optional, defaults to :obj:`True`):
                 Whether special tokens should be predicted when the model is has a language modeling head.
+            summary_type (:obj:`string`, optional, defaults to "cls_index"):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+                Is one of the following options:
+                    - 'last' => take the last token hidden state (like XLNet)
+                    - 'first' => take the first token hidden state (like Bert)
+                    - 'mean' => take the mean of all tokens hidden states
+                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+                    - 'attn' => Not implemented now, use multi-head attention
+            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+                Add a projection after the vector extraction
+            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+                'tanh' => add a tanh activation to the output, Other => no activation.
+            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
+            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+                Add a dropout before the projection and activation
     """
 
     pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/src/transformers/configuration_xlm.py b/src/transformers/configuration_xlm.py
index 0aa449ae7b..3dc26047d8 100644
--- a/src/transformers/configuration_xlm.py
+++ b/src/transformers/configuration_xlm.py
@@ -103,10 +103,35 @@ class XLMConfig(PretrainedConfig):
                 The index of the masking token in the vocabulary.
             is_encoder(:obj:`boolean`, optional, defaults to :obj:`True`):
                 Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
+            summary_type (:obj:`string`, optional, defaults to "first"):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.XLMForSequenceClassification`.
+                Is one of the following options:
+                    - 'last' => take the last token hidden state (like XLNet)
+                    - 'first' => take the first token hidden state (like Bert)
+                    - 'mean' => take the mean of all tokens hidden states
+                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+                    - 'attn' => Not implemented now, use multi-head attention
+            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.XLMForSequenceClassification`.
+                Add a projection after the vector extraction
+            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.XLMForSequenceClassification`.
+                'tanh' => add a tanh activation to the output, Other => no activation.
+            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.XLMForSequenceClassification`.
+                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
+            summary_first_dropout (:obj:`float`, optional, defaults to 0.1):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.XLMForSequenceClassification`.
+                Add a dropout before the projection and activation
             start_n_top (:obj:`int`, optional, defaults to 5):
-                TODO
+                Used in the SQuAD evaluation script for XLM and XLNetV.
             end_n_top (:obj:`int`, optional, defaults to 5):
-                TODO
+                Used in the SQuAD evaluation script for XLM and XLNet.
             mask_token_id (:obj:`int`, optional, defaults to 0):
                 Model agnostic parameter to identify masked tokens when generating text in an MLM context.
             lang_id (:obj:`int`, optional, defaults to 1):
diff --git a/src/transformers/configuration_xlnet.py b/src/transformers/configuration_xlnet.py
index 15337d3920..7fe427344d 100644
--- a/src/transformers/configuration_xlnet.py
+++ b/src/transformers/configuration_xlnet.py
@@ -80,10 +80,35 @@ class XLNetConfig(PretrainedConfig):
                 Setting this attribute to -1 means no clamping.
             same_length (:obj:`boolean`, optional, defaults to :obj:`False`):
                 Whether to use the same attention length for each token.
+            summary_type (:obj:`string`, optional, defaults to "last"):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
+                Is one of the following options:
+                    - 'last' => take the last token hidden state (like XLNet)
+                    - 'first' => take the first token hidden state (like Bert)
+                    - 'mean' => take the mean of all tokens hidden states
+                    - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
+                    - 'attn' => Not implemented now, use multi-head attention
+            summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
+                Add a projection after the vector extraction
+            summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
+                'tanh' => add a tanh activation to the output, Other => no activation.
+            summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
+                If True, the projection outputs to config.num_labels classes (otherwise to hidden_size).
+            summary_last_dropout (:obj:`float`, optional, defaults to 0.1):
+                Argument used when doing sequence summary. Used in for the multiple choice head in
+                :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
+                Add a dropout after the projection and activation
             start_n_top (:obj:`int`, optional, defaults to 5):
-                TODO
+                Used in the SQuAD evaluation script for XLM and XLNetV.
             end_n_top (:obj:`int`, optional, defaults to 5):
-                TODO
+                Used in the SQuAD evaluation script for XLM and XLNet.
     """
 
     pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP

From 7d1bb7f2560c514d55561549e30899686fa1242c Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Mon, 13 Jan 2020 13:13:11 +0100
Subject: [PATCH 13/17] Add missing XLNet and XLM models

---
 docs/source/model_doc/xlm.rst   |  7 +++++++
 docs/source/model_doc/xlnet.rst | 21 +++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/docs/source/model_doc/xlm.rst b/docs/source/model_doc/xlm.rst
index f7034bb9d8..0c2231be42 100644
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -34,6 +34,13 @@ XLM
     :members:
 
 
+``XLMForQuestionAnsweringSimple``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMForQuestionAnsweringSimple
+    :members:
+
+
 ``XLMForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst
index 4005ce3a0a..10f06b76ad 100644
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -36,6 +36,27 @@ XLNet
     :members:
 
 
+``XLNetForTokenClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLNetForTokenClassification`
+    :members:
+
+
+``XLNetForMultipleChoice``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLNetForMultipleChoice`
+    :members:
+
+
+``XLNetForQuestionAnsweringSimple``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLNetForQuestionAnsweringSimple
+    :members:
+
+
 ``XLNetForQuestionAnswering``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

From 387217bd3e9a564cd84d4c4cc3c2f25ce30966bc Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Mon, 13 Jan 2020 13:27:34 +0100
Subject: [PATCH 14/17] Added example usage

---
 docs/source/model_doc/xlnet.rst              |  4 ++--
 src/transformers/configuration_bert.py       | 17 +++++++++++++++++
 src/transformers/configuration_camembert.py  | 17 +++++++++++++++++
 src/transformers/configuration_ctrl.py       | 17 +++++++++++++++++
 src/transformers/configuration_distilbert.py | 17 +++++++++++++++++
 src/transformers/configuration_gpt2.py       | 17 +++++++++++++++++
 src/transformers/configuration_openai.py     | 17 +++++++++++++++++
 src/transformers/configuration_transfo_xl.py | 17 +++++++++++++++++
 src/transformers/configuration_xlm.py        | 19 ++++++++++++++++++-
 src/transformers/configuration_xlnet.py      | 19 ++++++++++++++++++-
 10 files changed, 157 insertions(+), 4 deletions(-)

diff --git a/docs/source/model_doc/xlnet.rst b/docs/source/model_doc/xlnet.rst
index 10f06b76ad..0317fa0d78 100644
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -39,14 +39,14 @@ XLNet
 ``XLNetForTokenClassification``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: transformers.XLNetForTokenClassification`
+.. autoclass:: transformers.XLNetForTokenClassification
     :members:
 
 
 ``XLNetForMultipleChoice``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: transformers.XLNetForMultipleChoice`
+.. autoclass:: transformers.XLNetForMultipleChoice
     :members:
 
 
diff --git a/src/transformers/configuration_bert.py b/src/transformers/configuration_bert.py
index 62bc43f46e..c61fc418b7 100644
--- a/src/transformers/configuration_bert.py
+++ b/src/transformers/configuration_bert.py
@@ -88,6 +88,23 @@ class BertConfig(PretrainedConfig):
                 The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
             layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
                 The epsilon used by the layer normalization layers.
+
+        Example::
+
+            from transformers import BertModel, BertConfig
+
+            # Initializing a BERT bert-base-uncased style configuration
+            configuration = BertConfig()
+
+            # Initializing a model from the bert-base-uncased style configuration
+            model = BertModel(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
     """
     pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
diff --git a/src/transformers/configuration_camembert.py b/src/transformers/configuration_camembert.py
index 0063e4ada3..6765fd61b1 100644
--- a/src/transformers/configuration_camembert.py
+++ b/src/transformers/configuration_camembert.py
@@ -41,5 +41,22 @@ class CamembertConfig(RobertaConfig):
 
         The :class:`~transformers.CamembertConfig` class directly inherits :class:`~transformers.BertConfig`.
         It reuses the same defaults. Please check the parent class for more information.
+
+        Example::
+
+            from transformers import CamembertModel, CamembertConfig
+
+            # Initializing a CamemBERT configuration
+            configuration = CamembertConfig()
+
+            # Initializing a model from the configuration
+            model = CamembertModel(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
     """
     pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/src/transformers/configuration_ctrl.py b/src/transformers/configuration_ctrl.py
index 446ebffed7..ea1e861a18 100644
--- a/src/transformers/configuration_ctrl.py
+++ b/src/transformers/configuration_ctrl.py
@@ -63,6 +63,23 @@ class CTRLConfig(PretrainedConfig):
                 The epsilon to use in the layer normalization layers
             initializer_range (:obj:`float`, optional, defaults to 0.02):
                 The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+        Example::
+
+            from transformers import CTRLModel, CTRLConfig
+
+            # Initializing a CTRL configuration
+            configuration = CTRLConfig()
+
+            # Initializing a model from the configuration
+            model = CTRLModel(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
     """
 
     pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py
index ffad013704..a2f541b679 100644
--- a/src/transformers/configuration_distilbert.py
+++ b/src/transformers/configuration_distilbert.py
@@ -74,6 +74,23 @@ class DistilBertConfig(PretrainedConfig):
             seq_classif_dropout (:obj:`float`, optional, defaults to 0.2):
                 The dropout probabilities used in the sequence classification model
                 :class:`~tranformers.DistilBertForSequenceClassification`.
+
+        Example::
+
+            from transformers import DistilBertModel, DistilBertConfig
+
+            # Initializing a DistilBERT configuration
+            configuration = DistilBertConfig()
+
+            # Initializing a model from the configuration
+            model = DistilBertModel(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
     """
     pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py
index 2b9042bd75..26315fd1e3 100644
--- a/src/transformers/configuration_gpt2.py
+++ b/src/transformers/configuration_gpt2.py
@@ -94,6 +94,23 @@ class GPT2Config(PretrainedConfig):
                 Argument used when doing sequence summary. Used in for the multiple choice head in
                 :class:`~transformers.GPT2DoubleHeadsModel`.
                 Add a dropout before the projection and activation
+
+        Example::
+
+            from transformers import GPT2Model, GPT2Config
+
+            # Initializing a GPT2 configuration
+            configuration = GPT2Config()
+
+            # Initializing a model from the configuration
+            model = GPT2Model(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
     """
 
     pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py
index e6fc1efce3..f55c922209 100644
--- a/src/transformers/configuration_openai.py
+++ b/src/transformers/configuration_openai.py
@@ -94,6 +94,23 @@ class OpenAIGPTConfig(PretrainedConfig):
                 Argument used when doing sequence summary. Used in for the multiple choice head in
                 :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
                 Add a dropout before the projection and activation
+
+        Example::
+
+            from transformers import OpenAIGPTConfig, OpenAIGPTModel
+
+            # Initializing a GPT configuration
+            configuration = OpenAIGPTConfig()
+
+            # Initializing a model from the configuration
+            model = OpenAIGPTModel(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
     """
 
     pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/src/transformers/configuration_transfo_xl.py b/src/transformers/configuration_transfo_xl.py
index e4052408d2..d1f2ab42ee 100644
--- a/src/transformers/configuration_transfo_xl.py
+++ b/src/transformers/configuration_transfo_xl.py
@@ -97,6 +97,23 @@ class TransfoXLConfig(PretrainedConfig):
                 Parameters initialized by N(0, init_std)
             layer_norm_epsilon (:obj:`float`, optional, defaults to 1e-5):
                 The epsilon to use in the layer normalization layers
+
+        Example::
+
+            from transformers import TransfoXLConfig, TransfoXLModel
+
+            # Initializing a Transformer XL configuration
+            configuration = TransfoXLConfig()
+
+            # Initializing a model from the configuration
+            model = TransfoXLModel(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
     """
 
     pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/src/transformers/configuration_xlm.py b/src/transformers/configuration_xlm.py
index 3dc26047d8..c34c93ebde 100644
--- a/src/transformers/configuration_xlm.py
+++ b/src/transformers/configuration_xlm.py
@@ -129,7 +129,7 @@ class XLMConfig(PretrainedConfig):
                 :class:`~transformers.XLMForSequenceClassification`.
                 Add a dropout before the projection and activation
             start_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNetV.
+                Used in the SQuAD evaluation script for XLM and XLNet.
             end_n_top (:obj:`int`, optional, defaults to 5):
                 Used in the SQuAD evaluation script for XLM and XLNet.
             mask_token_id (:obj:`int`, optional, defaults to 0):
@@ -137,6 +137,23 @@ class XLMConfig(PretrainedConfig):
             lang_id (:obj:`int`, optional, defaults to 1):
                 The ID of the language used by the model. This parameter is used when generating
                 text in a given language.
+
+        Example::
+
+            from transformers import XLMConfig, XLMModel
+
+            # Initializing a XLM configuration
+            configuration = XLMConfig()
+
+            # Initializing a model from the configuration
+            model = XLMModel(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
     """
 
     pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/src/transformers/configuration_xlnet.py b/src/transformers/configuration_xlnet.py
index 7fe427344d..24da0ca7c7 100644
--- a/src/transformers/configuration_xlnet.py
+++ b/src/transformers/configuration_xlnet.py
@@ -106,9 +106,26 @@ class XLNetConfig(PretrainedConfig):
                 :class:`~transformers.XLNetForSequenceClassification` and :class:`~transformers.XLNetForMultipleChoice`.
                 Add a dropout after the projection and activation
             start_n_top (:obj:`int`, optional, defaults to 5):
-                Used in the SQuAD evaluation script for XLM and XLNetV.
+                Used in the SQuAD evaluation script for XLM and XLNet.
             end_n_top (:obj:`int`, optional, defaults to 5):
                 Used in the SQuAD evaluation script for XLM and XLNet.
+
+        Example::
+
+            from transformers import XLNetConfig, XLNetModel
+
+            # Initializing an XLNet configuration
+            configuration = XLNetConfig()
+
+            # Initializing a model from the configuration
+            model = XLNetModel(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
     """
 
     pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP

From 760164d63b4ba6a2add0d7e6f09347a47c4bfc41 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Mon, 13 Jan 2020 13:36:42 +0100
Subject: [PATCH 15/17] RoBERTa example

---
 src/transformers/configuration_roberta.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/transformers/configuration_roberta.py b/src/transformers/configuration_roberta.py
index a9c53d42aa..1e407e420e 100644
--- a/src/transformers/configuration_roberta.py
+++ b/src/transformers/configuration_roberta.py
@@ -46,5 +46,22 @@ class RobertaConfig(BertConfig):
 
         The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`.
         It reuses the same defaults. Please check the parent class for more information.
+
+        Example::
+
+            from transformers import RobertaConfig, RobertaModel
+
+            # Initializing a RoBERTa configuration
+            configuration = RobertaConfig()
+
+            # Initializing a model from the configuration
+            model = RobertaModel(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
     """
     pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP

From 6c32d8bb95aa81de6a047cca5ae732b93b9db020 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 14 Jan 2020 08:07:57 -0500
Subject: [PATCH 16/17] Size > Dimensionality + Remove final TODOs

---
 src/transformers/configuration_albert.py     | 6 +++---
 src/transformers/configuration_bert.py       | 4 ++--
 src/transformers/configuration_ctrl.py       | 4 ++--
 src/transformers/configuration_distilbert.py | 2 +-
 src/transformers/configuration_gpt2.py       | 2 +-
 src/transformers/configuration_openai.py     | 2 +-
 src/transformers/configuration_xlm.py        | 3 ++-
 src/transformers/configuration_xlnet.py      | 4 ++--
 8 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/transformers/configuration_albert.py b/src/transformers/configuration_albert.py
index c86107e503..c9dd6bbbbd 100644
--- a/src/transformers/configuration_albert.py
+++ b/src/transformers/configuration_albert.py
@@ -47,9 +47,9 @@ class AlbertConfig(PretrainedConfig):
                 Vocabulary size of the ALBERT model. Defines the different tokens that
                 can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.AlbertModel`.
             embedding_size (:obj:`int`, optional, defaults to 128):
-                Size of vocabulary embeddings.
+                Dimensionality of vocabulary embeddings.
             hidden_size (:obj:`int`, optional, defaults to 4096):
-                Size of the encoder layers and the pooler layer.
+                Dimensionality of the encoder layers and the pooler layer.
             num_hidden_layers (:obj:`int`, optional, defaults to 12):
                 Number of hidden layers in the Transformer encoder.
             num_hidden_groups (:obj:`int`, optional, defaults to 1):
@@ -57,7 +57,7 @@ class AlbertConfig(PretrainedConfig):
             num_attention_heads (:obj:`int`, optional, defaults to 64):
                 Number of attention heads for each attention layer in the Transformer encoder.
             intermediate_size (:obj:`int`, optional, defaults to 16384):
-                The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+                The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
             inner_group_num (:obj:`int`, optional, defaults to 1):
                 The number of inner repetition of attention and ffn.
             hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu_new"):
diff --git a/src/transformers/configuration_bert.py b/src/transformers/configuration_bert.py
index c61fc418b7..2e05d596d5 100644
--- a/src/transformers/configuration_bert.py
+++ b/src/transformers/configuration_bert.py
@@ -65,13 +65,13 @@ class BertConfig(PretrainedConfig):
                 Vocabulary size of the BERT model. Defines the different tokens that
                 can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
             hidden_size (:obj:`int`, optional, defaults to 768):
-                Size of the encoder layers and the pooler layer.
+                Dimensionality of the encoder layers and the pooler layer.
             num_hidden_layers (:obj:`int`, optional, defaults to 12):
                 Number of hidden layers in the Transformer encoder.
             num_attention_heads (:obj:`int`, optional, defaults to 12):
                 Number of attention heads for each attention layer in the Transformer encoder.
             intermediate_size (:obj:`int`, optional, defaults to 3072):
-                The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
             hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
                 The non-linear activation function (function or string) in the encoder and pooler.
                 If string, "gelu", "relu", "swish" and "gelu_new" are supported.
diff --git a/src/transformers/configuration_ctrl.py b/src/transformers/configuration_ctrl.py
index ea1e861a18..f5208fb2b6 100644
--- a/src/transformers/configuration_ctrl.py
+++ b/src/transformers/configuration_ctrl.py
@@ -44,11 +44,11 @@ class CTRLConfig(PretrainedConfig):
                 The maximum sequence length that this model might ever be used with.
                 Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
             n_ctx (:obj:`int`, optional, defaults to 256):
-                Size of the causal mask (usually same as n_positions).
+                Dimensionality of the causal mask (usually same as n_positions).
             n_embd (:obj:`int`, optional, defaults to 1280):
                 Dimensionality of the embeddings and hidden states.
             dff (:obj:`int`, optional, defaults to 8192):
-                Size of the inner dimension of the FFN.
+                Dimensionality of the inner layer of the feed-forward network (FFN).
             n_layer (:obj:`int`, optional, defaults to 48):
                 Number of hidden layers in the Transformer encoder.
             n_head (:obj:`int`, optional, defaults to 16):
diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py
index a2f541b679..60b8120b02 100644
--- a/src/transformers/configuration_distilbert.py
+++ b/src/transformers/configuration_distilbert.py
@@ -56,7 +56,7 @@ class DistilBertConfig(PretrainedConfig):
             n_heads (:obj:`int`, optional, defaults to 12):
                 Number of attention heads for each attention layer in the Transformer encoder.
             dim (:obj:`int`, optional, defaults to 768):
-                Size of the encoder layers and the pooler layer.
+                Dimensionality of the encoder layers and the pooler layer.
             intermediate_size (:obj:`int`, optional, defaults to 3072):
                 The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
             dropout (:obj:`float`, optional, defaults to 0.1):
diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py
index 26315fd1e3..0c3e33bfd7 100644
--- a/src/transformers/configuration_gpt2.py
+++ b/src/transformers/configuration_gpt2.py
@@ -52,7 +52,7 @@ class GPT2Config(PretrainedConfig):
                 The maximum sequence length that this model might ever be used with.
                 Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
             n_ctx (:obj:`int`, optional, defaults to 1024):
-                Size of the causal mask (usually same as n_positions).
+                Dimensionality of the causal mask (usually same as n_positions).
             n_embd (:obj:`int`, optional, defaults to 768):
                 Dimensionality of the embeddings and hidden states.
             n_layer (:obj:`int`, optional, defaults to 12):
diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py
index f55c922209..78ee367d6e 100644
--- a/src/transformers/configuration_openai.py
+++ b/src/transformers/configuration_openai.py
@@ -47,7 +47,7 @@ class OpenAIGPTConfig(PretrainedConfig):
                 The maximum sequence length that this model might ever be used with.
                 Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
             n_ctx (:obj:`int`, optional, defaults to 512):
-                Size of the causal mask (usually same as n_positions).
+                Dimensionality of the causal mask (usually same as n_positions).
             n_embd (:obj:`int`, optional, defaults to 768):
                 Dimensionality of the embeddings and hidden states.
             n_layer (:obj:`int`, optional, defaults to 12):
diff --git a/src/transformers/configuration_xlm.py b/src/transformers/configuration_xlm.py
index c34c93ebde..de74f94afa 100644
--- a/src/transformers/configuration_xlm.py
+++ b/src/transformers/configuration_xlm.py
@@ -72,7 +72,8 @@ class XLMConfig(PretrainedConfig):
                 Causal models use a triangular attention mask in order to only attend to the left-side context instead
                 if a bidirectional context.
             asm (:obj:`boolean`, optional, defaults to :obj:`False`):
-                TODO
+                Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction
+                layer.
             n_langs (:obj:`int`, optional, defaults to 1):
                 The number of languages the model handles. Set to 1 for monolingual models.
             use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`)
diff --git a/src/transformers/configuration_xlnet.py b/src/transformers/configuration_xlnet.py
index 24da0ca7c7..310aec38e2 100644
--- a/src/transformers/configuration_xlnet.py
+++ b/src/transformers/configuration_xlnet.py
@@ -45,13 +45,13 @@ class XLNetConfig(PretrainedConfig):
                 Vocabulary size of the XLNet model. Defines the different tokens that
                 can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLNetModel`.
             d_model (:obj:`int`, optional, defaults to 1024):
-                Size of the encoder layers and the pooler layer.
+                Dimensionality of the encoder layers and the pooler layer.
             n_layer (:obj:`int`, optional, defaults to 24):
                 Number of hidden layers in the Transformer encoder.
             n_head (:obj:`int`, optional, defaults to 16):
                 Number of attention heads for each attention layer in the Transformer encoder.
             d_inner (:obj:`int`, optional, defaults to 4096):
-                The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
             ff_activation (:obj:`string`, optional, defaults to "gelu"):
                 The non-linear activation function (function or string) in the
                 encoder and pooler. If string, "gelu", "relu" and "swish" are supported.

From 100e3b6f2133074bf746a78eb4c3a0ad3e939b5f Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 14 Jan 2020 10:13:31 -0500
Subject: [PATCH 17/17] Bias should be resized with the weights

Created a link between the linear layer bias and the model attribute bias. This does not change anything for the user nor for the conversion scripts, but allows the `resize_token_embeddings` method to resize the bias as well as the weights of the decoder.

Added a test.
---
 src/transformers/modeling_albert.py  | 3 +++
 src/transformers/modeling_bert.py    | 3 +++
 src/transformers/modeling_roberta.py | 3 +++
 tests/test_modeling_common.py        | 7 +++++++
 4 files changed, 16 insertions(+)

diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py
index c1540bda5f..4fae225212 100644
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -579,6 +579,9 @@ class AlbertMLMHead(nn.Module):
         self.decoder = nn.Linear(config.embedding_size, config.vocab_size)
         self.activation = ACT2FN[config.hidden_act]
 
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
     def forward(self, hidden_states):
         hidden_states = self.dense(hidden_states)
         hidden_states = self.activation(hidden_states)
diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py
index cdc46b9662..48ada95c75 100644
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -481,6 +481,9 @@ class BertLMPredictionHead(nn.Module):
 
         self.bias = nn.Parameter(torch.zeros(config.vocab_size))
 
+        # Link the two so `resize_token_embeddings` resizes the bias too. NOTE(review): forward() adds self.bias again
+        self.decoder.bias = self.bias
+
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states) + self.bias
diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py
index 56e983e01c..fc066cc7b8 100644
--- a/src/transformers/modeling_roberta.py
+++ b/src/transformers/modeling_roberta.py
@@ -306,6 +306,9 @@ class RobertaLMHead(nn.Module):
         self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.bias = nn.Parameter(torch.zeros(config.vocab_size))
 
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
     def forward(self, features, **kwargs):
         x = self.dense(features)
         x = gelu(x)
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 719debcb3c..1281febd9f 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -487,6 +487,8 @@ class ModelTesterMixin:
             self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
             # Check that it actually resizes the embeddings matrix
             self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            model(**inputs_dict)
 
             # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
             model_embed = model.resize_token_embeddings(model_vocab_size - 15)
@@ -494,6 +496,11 @@ class ModelTesterMixin:
             # Check that it actually resizes the embeddings matrix
             self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
 
+            # Check that the model can still do a forward pass successfully (every parameter should be resized)
+            # Input ids should be clamped to the maximum size of the vocabulary
+            inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+            model(**inputs_dict)
+
             # Check that adding and removing tokens has not modified the first part of the embedding matrix.
             models_equal = True
             for p1, p2 in zip(cloned_embeddings, model_embed.weight):