diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py index a8239038a7..c931ac6ab2 100644 --- a/pytorch_transformers/modeling_bert.py +++ b/pytorch_transformers/modeling_bert.py @@ -642,7 +642,7 @@ BERT_INPUTS_DOCSTRING = r""" @add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertModel(BertPreTrainedModel): - r""" + __doc__ = r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` Sequence of hidden-states at the last layer of the model. @@ -738,7 +738,7 @@ class BertModel(BertPreTrainedModel): a `masked language modeling` head and a `next sentence prediction (classification)` head. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertForPreTraining(BertPreTrainedModel): - r""" + __doc__ = r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the masked language modeling loss. Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) @@ -814,7 +814,7 @@ class BertForPreTraining(BertPreTrainedModel): @add_start_docstrings("""Bert Model transformer BERT model with a `language modeling` head on top. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertForMaskedLM(BertPreTrainedModel): - r""" + __doc__ = r""" **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the masked language modeling loss. Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) @@ -879,7 +879,7 @@ class BertForMaskedLM(BertPreTrainedModel): @add_start_docstrings("""Bert Model transformer BERT model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertForNextSentencePrediction(BertPreTrainedModel): - r""" + __doc__ = r""" **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) Indices should be in ``[0, 1]``. @@ -937,7 +937,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): the pooled output) e.g. for GLUE tasks. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertForSequenceClassification(BertPreTrainedModel): - r""" + __doc__ = r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Labels for computing the sequence classification/regression loss. Indices should be in ``[0, ..., config.num_labels]``. @@ -1005,7 +1005,7 @@ class BertForSequenceClassification(BertPreTrainedModel): the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, BERT_START_DOCSTRING) class BertForMultipleChoice(BertPreTrainedModel): - r""" + __doc__ = r""" Inputs: **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: Indices of input sequence tokens in the vocabulary. @@ -1110,7 +1110,7 @@ class BertForMultipleChoice(BertPreTrainedModel): the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) class BertForTokenClassification(BertPreTrainedModel): - r""" + __doc__ = r""" **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels]``.