diff --git a/docs/source/index.rst b/docs/source/index.rst
index f608cf5f58..f9ff1a0606 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -51,6 +51,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
 10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
 11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
 12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+13. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
 
 .. toctree::
     :maxdepth: 2
diff --git a/src/transformers/configuration_flaubert.py b/src/transformers/configuration_flaubert.py
index fa4f48d3dc..273fdb1868 100644
--- a/src/transformers/configuration_flaubert.py
+++ b/src/transformers/configuration_flaubert.py
@@ -45,7 +45,10 @@ class FlaubertConfig(XLMConfig):
         Args:
             pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether to apply the layer normalization before or after the feed forward layer following the
-                attention in each layer.
+                attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018).
+            layerdrop (:obj:`float`, `optional`, defaults to 0.0):
+                Probability of dropping each layer during training (Fan et al., Reducing Transformer Depth on Demand
+                with Structured Dropout. ICLR 2020).
             vocab_size (:obj:`int`, optional, defaults to 30145):
                 Vocabulary size of the XLM model. Defines the different tokens that
                 can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`.
diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py
index 72202dd296..d48f1273fb 100644
--- a/src/transformers/modeling_flaubert.py
+++ b/src/transformers/modeling_flaubert.py
@@ -16,6 +16,7 @@
 
 
 import logging
+import random
 
 import torch
 from torch.nn import functional as F
@@ -113,8 +114,8 @@ class FlaubertModel(XLMModel):
 
     def __init__(self, config):  # , dico, is_encoder, with_output):
         super(FlaubertModel, self).__init__(config)
-        self.layerdrop = 0.0 if not hasattr(config, "layerdrop") else config.layerdrop
-        self.pre_norm = False if not hasattr(config, "pre_norm") else config.pre_norm
+        self.layerdrop = getattr(config, "layerdrop", 0.0)
+        self.pre_norm = getattr(config, "pre_norm", False)
 
     @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
     def forward(
@@ -243,6 +244,11 @@ class FlaubertModel(XLMModel):
         hidden_states = ()
         attentions = ()
         for i in range(self.n_layers):
+            # LayerDrop
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):
+                continue
+
             if self.output_hidden_states:
                 hidden_states = hidden_states + (tensor,)