Add layerdrop
This commit is contained in:
@@ -51,6 +51,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
|
|||||||
10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
|
10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
|
||||||
11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
|
||||||
12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
|
12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
|
||||||
|
13. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
|
|||||||
@@ -45,7 +45,10 @@ class FlaubertConfig(XLMConfig):
|
|||||||
Args:
|
Args:
|
||||||
pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||||
Whether to apply the layer normalization before or after the feed forward layer following the
|
Whether to apply the layer normalization before or after the feed forward layer following the
|
||||||
attention in each layer.
|
attention in each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018).
|
||||||
|
layerdrop (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
|
Probability of dropping a layer during training (Fan et al., Reducing Transformer Depth on Demand
|
||||||
|
with Structured Dropout. ICLR 2020).
|
||||||
vocab_size (:obj:`int`, optional, defaults to 30145):
|
vocab_size (:obj:`int`, optional, defaults to 30145):
|
||||||
Vocabulary size of the XLM model. Defines the different tokens that
|
Vocabulary size of the XLM model. Defines the different tokens that
|
||||||
can be represented by the `input_ids` passed to the forward method of :class:`~transformers.XLMModel`.
|
can be represented by the `input_ids` passed to the forward method of :class:`~transformers.XLMModel`.
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
|
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import random
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
@@ -113,8 +114,8 @@ class FlaubertModel(XLMModel):
|
|||||||
|
|
||||||
def __init__(self, config): # , dico, is_encoder, with_output):
|
def __init__(self, config): # , dico, is_encoder, with_output):
|
||||||
super(FlaubertModel, self).__init__(config)
|
super(FlaubertModel, self).__init__(config)
|
||||||
self.layerdrop = 0.0 if not hasattr(config, "layerdrop") else config.layerdrop
|
self.layerdrop = getattr(config, "layerdrop", 0.0)
|
||||||
self.pre_norm = False if not hasattr(config, "pre_norm") else config.pre_norm
|
self.pre_norm = getattr(config, "pre_norm", False)
|
||||||
|
|
||||||
@add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
|
@add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING)
|
||||||
def forward(
|
def forward(
|
||||||
@@ -243,6 +244,11 @@ class FlaubertModel(XLMModel):
|
|||||||
hidden_states = ()
|
hidden_states = ()
|
||||||
attentions = ()
|
attentions = ()
|
||||||
for i in range(self.n_layers):
|
for i in range(self.n_layers):
|
||||||
|
# LayerDrop
|
||||||
|
dropout_probability = random.uniform(0, 1)
|
||||||
|
if self.training and (dropout_probability < self.layerdrop):
|
||||||
|
continue
|
||||||
|
|
||||||
if self.output_hidden_states:
|
if self.output_hidden_states:
|
||||||
hidden_states = hidden_states + (tensor,)
|
hidden_states = hidden_states + (tensor,)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user