diff --git a/docs/source/index.rst b/docs/source/index.rst index f608cf5f58..f9ff1a0606 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -51,6 +51,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train 10. `CamemBERT `_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model `_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot. 11. `ALBERT `_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations `_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. 12. `XLM-RoBERTa `_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale `_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. +13. `FlauBERT `_ (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French `_ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. .. toctree:: :maxdepth: 2 diff --git a/src/transformers/configuration_flaubert.py b/src/transformers/configuration_flaubert.py index fa4f48d3dc..273fdb1868 100644 --- a/src/transformers/configuration_flaubert.py +++ b/src/transformers/configuration_flaubert.py @@ -45,7 +45,10 @@ class FlaubertConfig(XLMConfig): Args: pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to apply the layer normalization before or after the feed forward layer following the - attention in each layer. + attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 
2018). + layerdrop (:obj:`float`, `optional`, defaults to 0.0): + Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand + with Structured Dropout. ICLR 2020). vocab_size (:obj:`int`, optional, defaults to 30145): Vocabulary size of the XLM model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`. diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py index 72202dd296..d48f1273fb 100644 --- a/src/transformers/modeling_flaubert.py +++ b/src/transformers/modeling_flaubert.py @@ -16,6 +16,7 @@ import logging +import random import torch from torch.nn import functional as F @@ -113,8 +114,8 @@ class FlaubertModel(XLMModel): def __init__(self, config): # , dico, is_encoder, with_output): super(FlaubertModel, self).__init__(config) - self.layerdrop = 0.0 if not hasattr(config, "layerdrop") else config.layerdrop - self.pre_norm = False if not hasattr(config, "pre_norm") else config.pre_norm + self.layerdrop = getattr(config, "layerdrop", 0.0) + self.pre_norm = getattr(config, "pre_norm", False) @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING) def forward( @@ -243,6 +244,11 @@ class FlaubertModel(XLMModel): hidden_states = () attentions = () for i in range(self.n_layers): + # LayerDrop + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + if self.output_hidden_states: hidden_states = hidden_states + (tensor,)