From 73306d028b379694b8eda56902ef735215cb570a Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 29 Jan 2020 15:16:22 -0500 Subject: [PATCH] FlauBERT documentation --- docs/source/index.rst | 3 +- docs/source/model_doc/flaubert.rst | 72 ++++++ src/transformers/configuration_flaubert.py | 139 ++++++++--- src/transformers/modeling_flaubert.py | 274 ++++++--------------- 4 files changed, 247 insertions(+), 241 deletions(-) create mode 100644 docs/source/model_doc/flaubert.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 988d530134..f608cf5f58 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -97,4 +97,5 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train model_doc/ctrl model_doc/camembert model_doc/albert - model_doc/xlmroberta \ No newline at end of file + model_doc/xlmroberta + model_doc/flaubert \ No newline at end of file diff --git a/docs/source/model_doc/flaubert.rst b/docs/source/model_doc/flaubert.rst new file mode 100644 index 0000000000..d0211306ee --- /dev/null +++ b/docs/source/model_doc/flaubert.rst @@ -0,0 +1,72 @@ +FlauBERT +---------------------------------------------------- + +The FlauBERT model was proposed in the paper +`FlauBERT: Unsupervised Language Model Pre-training for French `__ by Hang Le et al. +It's a transformer pre-trained using a masked language modeling (MLM) objective (BERT-like). + +The abstract from the paper is the following: + +*Language models have become a key step to achieve state-of-the art results in many different Natural Language +Processing (NLP) tasks. Leveraging the huge amount of unlabeled texts nowadays available, they provide an efficient +way to pre-train continuous word representations that can be fine-tuned for a downstream task, along with their +contextualization at the sentence level. 
This has been widely demonstrated for English using contextualized +representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018; Radford et al., 2018; Devlin et +al., 2019; Yang et al., 2019b). In this paper, we introduce and share FlauBERT, a model learned on a very large +and heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre +for Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text +classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most +of the time they outperform other pre-training approaches. Different versions of FlauBERT as well as a unified +evaluation protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared +to the research community for further reproducible experiments in French NLP.* + + +FlaubertConfig +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.FlaubertConfig + :members: + + +FlaubertTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.FlaubertTokenizer + :members: + + +FlaubertModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.FlaubertModel + :members: + + +FlaubertWithLMHeadModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.FlaubertWithLMHeadModel + :members: + + +FlaubertForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.FlaubertForSequenceClassification + :members: + + +FlaubertForQuestionAnsweringSimple +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.FlaubertForQuestionAnsweringSimple + :members: + + +FlaubertForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.FlaubertForQuestionAnswering + :members: + + diff --git a/src/transformers/configuration_flaubert.py b/src/transformers/configuration_flaubert.py index ba7e8df346..fa4f48d3dc 100644 --- a/src/transformers/configuration_flaubert.py +++ b/src/transformers/configuration_flaubert.py @@ -31,44 +31,111 @@ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { class FlaubertConfig(XLMConfig): - """Configuration class to store the configuration of a `FlaubertModel`. + """ + Configuration class to store the configuration of a `FlaubertModel`. + This is the configuration class to store the configuration of a :class:`~transformers.XLMModel`. + It is used to instantiate an XLM model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the `xlm-mlm-en-2048 `__ architecture. - Args: - vocab_size: Vocabulary size of `inputs_ids` in `FlaubertModel`. - d_model: Size of the encoder layers and the pooler layer. - n_layer: Number of hidden layers in the Transformer encoder. - n_head: Number of attention heads for each attention layer in - the Transformer encoder. - d_inner: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - ff_activation: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - untie_r: untie relative position biases - attn_type: 'bi' for Flaubert, 'uni' for Transformer-XL + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. - dropout: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. 
Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_eps: The epsilon used by LayerNorm. - - dropout: float, dropout rate. - init: str, the initialization scheme, either "normal" or "uniform". - init_range: float, initialize the parameters with a uniform distribution - in [-init_range, init_range]. Only effective when init="uniform". - init_std: float, initialize the parameters with a normal distribution - with mean 0 and stddev init_std. Only effective when init="normal". - mem_len: int, the number of tokens to cache. - reuse_len: int, the number of tokens in the currect batch to be cached - and reused in the future. - bi_data: bool, whether to use bidirectional input pipeline. - Usually set to True during pretraining and False during finetuning. - clamp_len: int, clamp all relative distances larger than clamp_len. - -1 means no clamping. - same_length: bool, whether to use the same attention length for each token. + Args: + pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to apply the layer normalization before or after the feed forward layer following the + attention in each layer. + vocab_size (:obj:`int`, optional, defaults to 30145): + Vocabulary size of the XLM model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.XLMModel`. + emb_dim (:obj:`int`, optional, defaults to 2048): + Dimensionality of the encoder layers and the pooler layer. + n_layer (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (:obj:`int`, optional, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. 
+ dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, optional, defaults to 0.1): + The dropout probability for the attention mechanism. + gelu_activation (:obj:`boolean`, optional, defaults to :obj:`True`): + The non-linear activation function (function or string) in the + encoder and pooler. If set to `True`, "gelu" will be used instead of "relu". + sinusoidal_embeddings (:obj:`boolean`, optional, defaults to :obj:`False`): + Whether to use sinusoidal positional embeddings instead of absolute positional embeddings. + causal (:obj:`boolean`, optional, defaults to :obj:`False`): + Set this to `True` for the model to behave in a causal manner. + Causal models use a triangular attention mask in order to only attend to the left-side context instead + of a bidirectional context. + asm (:obj:`boolean`, optional, defaults to :obj:`False`): + Whether to use an adaptive log softmax projection layer instead of a linear layer for the prediction + layer. + n_langs (:obj:`int`, optional, defaults to 1): + The number of languages the model handles. Set to 1 for monolingual models. + use_lang_emb (:obj:`boolean`, optional, defaults to :obj:`True`): + Whether to use language embeddings. Some models use additional language embeddings, see + `the multilingual models page `__ + for information on how to use them. + max_position_embeddings (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + embed_init_std (:obj:`float`, optional, defaults to 2048^-0.5): + The standard deviation of the truncated_normal_initializer for + initializing the embedding matrices. 
+ init_std (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for + initializing all weight matrices except the embedding matrices. + layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. + bos_index (:obj:`int`, optional, defaults to 0): + The index of the beginning of sentence token in the vocabulary. + eos_index (:obj:`int`, optional, defaults to 1): + The index of the end of sentence token in the vocabulary. + pad_index (:obj:`int`, optional, defaults to 2): + The index of the padding token in the vocabulary. + unk_index (:obj:`int`, optional, defaults to 3): + The index of the unknown token in the vocabulary. + mask_index (:obj:`int`, optional, defaults to 5): + The index of the masking token in the vocabulary. + is_encoder (:obj:`boolean`, optional, defaults to :obj:`True`): + Whether the initialized model should be a transformer encoder or decoder as seen in Vaswani et al. + summary_type (:obj:`string`, optional, defaults to "first"): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.XLMForSequenceClassification`. + Is one of the following options: + - 'last' => take the last token hidden state (like XLNet) + - 'first' => take the first token hidden state (like Bert) + - 'mean' => take the mean of all tokens hidden states + - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) + - 'attn' => Not implemented now, use multi-head attention + summary_use_proj (:obj:`boolean`, optional, defaults to :obj:`True`): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.XLMForSequenceClassification`. + Add a projection after the vector extraction + summary_activation (:obj:`string` or :obj:`None`, optional, defaults to :obj:`None`): + Argument used when doing sequence summary. 
Used for the multiple choice head in + :class:`~transformers.XLMForSequenceClassification`. + 'tanh' => add a tanh activation to the output, Other => no activation. + summary_proj_to_labels (:obj:`boolean`, optional, defaults to :obj:`True`): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.XLMForSequenceClassification`. + If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). + summary_first_dropout (:obj:`float`, optional, defaults to 0.1): + Argument used when doing sequence summary. Used for the multiple choice head in + :class:`~transformers.XLMForSequenceClassification`. + Add a dropout before the projection and activation + start_n_top (:obj:`int`, optional, defaults to 5): + Used in the SQuAD evaluation script for XLM and XLNet. + end_n_top (:obj:`int`, optional, defaults to 5): + Used in the SQuAD evaluation script for XLM and XLNet. + mask_token_id (:obj:`int`, optional, defaults to 0): + Model agnostic parameter to identify masked tokens when generating text in an MLM context. + lang_id (:obj:`int`, optional, defaults to 1): + The ID of the language used by the model. This parameter is used when generating + text in a given language. 
""" pretrained_config_archive_map = FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py index 41403884af..b187c8076a 100644 --- a/src/transformers/modeling_flaubert.py +++ b/src/transformers/modeling_flaubert.py @@ -21,7 +21,7 @@ import torch from torch.nn import functional as F from .configuration_flaubert import FlaubertConfig -from .file_utils import add_start_docstrings +from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_xlm import ( XLMForQuestionAnswering, XLMForQuestionAnsweringSimple, @@ -42,24 +42,11 @@ FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { } -FLAUBERT_START_DOCSTRING = r""" The Flaubert model was proposed in - `FlauBERT: Unsupervised Language Model Pre-training for French`_ - by Hang Le et al. It's a transformer pre-trained using a masked - language modeling (MLM) objective (BERT-like). +FLAUBERT_START_DOCSTRING = r""" - Original code can be found `here`_. - - This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and - refer to the PyTorch documentation for all matters related to general usage and behavior. - - .. _`FlauBERT: Unsupervised Language Model Pre-training for French`: - https://arxiv.org/abs/1912.05372 - - .. _`torch.nn.Module`: - https://pytorch.org/docs/stable/nn.html#module - - .. _`here`: - https://github.com/getalp/Flaubert + This model is a PyTorch `torch.nn.Module `_ sub-class. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general + usage and behavior. Parameters: config (:class:`~transformers.FlaubertConfig`): Model configuration class with all the parameters of the model. 
@@ -68,42 +55,47 @@ FLAUBERT_START_DOCSTRING = r""" The Flaubert model was proposed in """ FLAUBERT_INPUTS_DOCSTRING = r""" - Inputs: - **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. - Flaubert is a model with absolute position embeddings so it's usually advised to pad the inputs on - the right rather than the left. - - Indices can be obtained using :class:`transformers.FlaubertTokenizer`. + Indices can be obtained using :class:`transformers.FlaubertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - A parallel sequence of tokens (can be used to indicate various portions of the inputs). - The embeddings from these tokens will be summed with the respective token embeddings. - Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). - **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + + `What are attention masks? 
<../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Segment token indices to indicate first and second portions of the inputs. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, config.max_position_embeddings - 1]``. - **lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + + `What are position IDs? <../glossary.html#position-ids>`_ + lengths (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Length of each sentence that can be used to avoid performing attention on padding token indices. You can also use `attention_mask` for the same result (see above), kept here for compatbility. Indices selected in ``[0, ..., input_ids.size(-1)]``: - **cache**: + cache (:obj:`Dict[str, torch.FloatTensor]`, `optional`, defaults to :obj:`None`): dictionary with ``torch.FloatTensor`` that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential decoding. The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states. - **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. 
Mask values selected in ``[0, 1]``: - ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. - **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``: - Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation. + :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @@ -112,30 +104,8 @@ FLAUBERT_INPUTS_DOCSTRING = r""" @add_start_docstrings( "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.", FLAUBERT_START_DOCSTRING, - FLAUBERT_INPUTS_DOCSTRING, ) class FlaubertModel(XLMModel): - r""" - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` - Sequence of hidden-states at the last layer of the model. - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
- **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased') - model = FlaubertModel.from_pretrained('flaubert-base-cased') - input_ids = torch.tensor(tokenizer.encode("Le chat manges une pomme.", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids) - last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple - - """ config_class = FlaubertConfig pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP @@ -146,6 +116,7 @@ class FlaubertModel(XLMModel): self.layerdrop = 0.0 if not hasattr(config, "layerdrop") else config.layerdrop self.pre_norm = False if not hasattr(config, "pre_norm") else config.pre_norm + @add_start_docstrings_to_callable(FLAUBERT_INPUTS_DOCSTRING) def forward( self, input_ids=None, @@ -157,7 +128,34 @@ class FlaubertModel(XLMModel): cache=None, head_mask=None, inputs_embeds=None, - ): # removed: src_enc=None, src_len=None + ): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.FlaubertConfig`) and inputs: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + Examples:: + + tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased') + model = FlaubertModel.from_pretrained('flaubert-base-cased') + input_ids = torch.tensor(tokenizer.encode("Le chat manges une pomme.", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + # removed: src_enc=None, src_len=None if input_ids is not None: bs, slen = input_ids.size() else: @@ -306,38 +304,11 @@ class FlaubertModel(XLMModel): """The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings). """, FLAUBERT_START_DOCSTRING, - FLAUBERT_INPUTS_DOCSTRING, ) class FlaubertWithLMHeadModel(XLMWithLMHeadModel): - r""" - **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Labels for language modeling. - Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` - Indices are selected in ``[-1, 0, ..., config.vocab_size]`` - All labels set to ``-100`` are ignored (masked), the loss is only - computed for labels in ``[0, ..., config.vocab_size]`` - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Language modeling loss. 
- **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased') - model = FlaubertWithLMHeadModel.from_pretrained('flaubert-base-cased') - input_ids = torch.tensor(tokenizer.encode("Le chat manges une pomme.", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids) - last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple - + """ + This class overrides :class:`~transformers.XLMWithLMHeadModel`. Please check the + superclass for the appropriate documentation alongside usage examples. """ config_class = FlaubertConfig pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP @@ -352,38 +323,11 @@ class FlaubertWithLMHeadModel(XLMWithLMHeadModel): """Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", FLAUBERT_START_DOCSTRING, - FLAUBERT_INPUTS_DOCSTRING, ) class FlaubertForSequenceClassification(XLMForSequenceClassification): - r""" - **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for computing the sequence classification/regression loss. - Indices should be in ``[0, ..., config.num_labels - 1]``. - If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), - If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Classification (or regression if config.num_labels==1) loss. - **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` - Classification (or regression if config.num_labels==1) scores (before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
- - Examples:: - - tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased') - model = FlaubertForSequenceClassification.from_pretrained('flaubert-base-cased') - input_ids = torch.tensor(tokenizer.encode("Le chat manges une pomme.", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, labels=labels) - loss, logits = outputs[:2] - + """ + This class overrides :class:`~transformers.XLMForSequenceClassification`. Please check the + superclass for the appropriate documentation alongside usage examples. """ config_class = FlaubertConfig pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP @@ -398,50 +342,11 @@ class FlaubertForSequenceClassification(XLMForSequenceClassification): """Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, FLAUBERT_START_DOCSTRING, - FLAUBERT_INPUTS_DOCSTRING, ) class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple): - r""" - **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. 
- **is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels whether a question has an answer or no answer (SQuAD 2.0) - **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for position (index) of the classification token to use as input for computing plausibility of the answer. - **p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...) - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. - **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` - Span-start scores (before SoftMax). - **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` - Span-end scores (before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
- - Examples:: - - tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased') - model = FlaubertForQuestionAnsweringSimple.from_pretrained('flaubert-base-cased') - input_ids = torch.tensor(tokenizer.encode("Le chat manges une pomme", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - start_positions = torch.tensor([1]) - end_positions = torch.tensor([3]) - outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) - loss, start_scores, end_scores = outputs[:2] - + """ + This class overrides :class:`~transformers.XLMForQuestionAnsweringSimple`. Please check the + superclass for the appropriate documentation alongside usage examples. """ config_class = FlaubertConfig pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP @@ -456,50 +361,11 @@ class FlaubertForQuestionAnsweringSimple(XLMForQuestionAnsweringSimple): """Flaubert Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, FLAUBERT_START_DOCSTRING, - FLAUBERT_INPUTS_DOCSTRING, ) class FlaubertForQuestionAnswering(XLMForQuestionAnswering): - r""" - **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. 
- **is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels whether a question has an answer or no answer (SQuAD 2.0) - **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for position (index) of the classification token to use as input for computing plausibility of the answer. - **p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...) - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. - **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` - Span-start scores (before SoftMax). - **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` - Span-end scores (before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
- - Examples:: - - tokenizer = FlaubertTokenizer.from_pretrained('flaubert-base-cased') - model = FlaubertForQuestionAnswering.from_pretrained('flaubert-base-cased') - input_ids = torch.tensor(tokenizer.encode("Le chat manges une pomme.", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - start_positions = torch.tensor([1]) - end_positions = torch.tensor([3]) - outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) - loss, start_scores, end_scores = outputs[:2] - + """ + This class overrides :class:`~transformers.XLMForQuestionAnswering`. Please check the + superclass for the appropriate documentation alongside usage examples. """ config_class = FlaubertConfig pretrained_model_archive_map = FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP